DataCamp offers interactive courses related to R programming. While some of the material is review, it is helpful to see other perspectives on it. As well, DataCamp has some interesting materials on packages that I want to learn better (ggplot2, dplyr, ggvis, etc.). This document summarizes a few key insights from:
This document is currently split between _v003 and _v003_a due to the need to keep the number of DLLs that it opens below the hard-coded maximum. This introductory section needs to be re-written, and the contents consolidated, at a future date.
The original DataCamp_Insights_v001 and DataCamp_Insights_v002 documents have been split to produce this document:
Chapter 1 - Dates and Times in R
Introduction to dates - including the built-in methods for R:
Why use dates?
What about times?
Why lubridate?
Example code includes:
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
# The date R 3.0.0 was released
x <- "2013-04-03"
# Examine structure of x
str(x)
## chr "2013-04-03"
# Use as.Date() to interpret x as a date
x_date <- as.Date(x)
# Examine structure of x_date
str(x_date)
## Date[1:1], format: "2013-04-03"
# Store April 10 2014 as a Date
april_10_2014 <- as.Date("2014-04-10")
# Load the readr package
library(readr)
# Use read_csv() to import rversions.csv
releases <- read_csv("./RInputFiles/rversions.csv")
## Parsed with column specification:
## cols(
## major = col_integer(),
## minor = col_integer(),
## patch = col_integer(),
## date = col_date(format = ""),
## datetime = col_datetime(format = ""),
## time = col_time(format = ""),
## type = col_character()
## )
# Examine the structure of the date column
str(releases$date)
## Date[1:105], format: "1997-12-04" "1997-12-21" "1998-01-10" "1998-03-14" ...
# Load the anytime package
library(anytime)
# Various ways of writing Sep 10 2009
sep_10_2009 <- c("September 10 2009", "2009-09-10", "10 Sep 2009", "09-10-2009")
# Use anytime() to parse sep_10_2009
anytime(sep_10_2009)
## [1] "2009-09-10 CDT" "2009-09-10 CDT" "2009-09-10 CDT" "2009-09-10 CDT"
# Set the x axis to the date column
ggplot(releases, aes(x = date, y = type)) +
geom_line(aes(group = 1, color = factor(major)))
# Limit the axis to between 2010-01-01 and 2014-01-01
ggplot(releases, aes(x = date, y = type)) +
geom_line(aes(group = 1, color = factor(major))) +
xlim(as.Date("2010-01-01"), as.Date("2014-01-01"))
## Warning: Removed 87 rows containing missing values (geom_path).
# Specify breaks every ten years and labels with "%Y"
ggplot(releases, aes(x = date, y = type)) +
geom_line(aes(group = 1, color = factor(major))) +
scale_x_date(date_breaks = "10 years", date_labels = "%Y")
# Find the largest date
last_release_date <- max(releases$date)
# Filter row for last release
last_release <- filter(releases, date == last_release_date)
# Print last_release
last_release
## # A tibble: 1 x 7
## major minor patch date datetime time type
## <int> <int> <int> <date> <dttm> <time> <chr>
## 1 3 4 1 2017-06-30 2017-06-30 07:04:11 07:04 patch
# How long since last release?
Sys.Date() - last_release_date
## Time difference of 280 days
# Use as.POSIXct to enter the datetime
as.POSIXct("2010-10-01 12:12:00")
## [1] "2010-10-01 12:12:00 CDT"
# Use as.POSIXct again but set the timezone to `"America/Los_Angeles"`
as.POSIXct("2010-10-01 12:12:00", tz = "America/Los_Angeles")
## [1] "2010-10-01 12:12:00 PDT"
# Use read_csv to import rversions.csv
releases <- read_csv("./RInputFiles/rversions.csv")
## Parsed with column specification:
## cols(
## major = col_integer(),
## minor = col_integer(),
## patch = col_integer(),
## date = col_date(format = ""),
## datetime = col_datetime(format = ""),
## time = col_time(format = ""),
## type = col_character()
## )
# Examine structure of datetime column
str(releases$datetime)
## POSIXct[1:105], format: "1997-12-04 08:47:58" "1997-12-21 13:09:22" ...
# Import "cran-logs_2015-04-17.csv" with read_csv()
logs <- read_csv("./RInputFiles/cran-logs_2015-04-17.csv")
## Parsed with column specification:
## cols(
## datetime = col_datetime(format = ""),
## r_version = col_character(),
## country = col_character()
## )
# Print logs
logs
## # A tibble: 100,000 x 3
## datetime r_version country
## <dttm> <chr> <chr>
## 1 2015-04-16 22:40:19 3.1.3 CO
## 2 2015-04-16 09:11:04 3.1.3 GB
## 3 2015-04-16 17:12:37 3.1.3 DE
## 4 2015-04-18 12:34:43 3.2.0 GB
## 5 2015-04-16 04:49:18 3.1.3 PE
## 6 2015-04-16 06:40:44 3.1.3 TW
## 7 2015-04-16 00:21:36 3.1.3 US
## 8 2015-04-16 10:27:23 3.1.3 US
## 9 2015-04-16 01:59:43 3.1.3 SG
## 10 2015-04-18 15:41:32 3.2.0 CA
## # ... with 99,990 more rows
# Store the release time as a POSIXct object
release_time <- as.POSIXct("2015-04-16 07:13:33", tz = "UTC")
# When is the first download of 3.2.0?
logs %>%
filter(r_version == "3.2.0")
## # A tibble: 35,928 x 3
## datetime r_version country
## <dttm> <chr> <chr>
## 1 2015-04-18 12:34:43 3.2.0 GB
## 2 2015-04-18 15:41:32 3.2.0 CA
## 3 2015-04-18 14:58:41 3.2.0 IE
## 4 2015-04-18 16:44:45 3.2.0 US
## 5 2015-04-18 04:34:35 3.2.0 US
## 6 2015-04-18 22:29:45 3.2.0 CH
## 7 2015-04-17 16:21:06 3.2.0 US
## 8 2015-04-18 20:34:57 3.2.0 AT
## 9 2015-04-17 18:23:19 3.2.0 US
## 10 2015-04-18 03:00:31 3.2.0 US
## # ... with 35,918 more rows
# Examine histograms of downloads by version
ggplot(logs, aes(x = datetime)) +
geom_histogram() +
geom_vline(aes(xintercept = as.numeric(release_time)))+
facet_wrap(~ r_version, ncol = 1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Chapter 2 - Parsing and Manipulating Dates with lubridate
Parsing dates with lubridate:
Weather in Auckland (data from Weather Underground, METAR from Auckland airport):
Extracting parts of a datetime:
Rounding datetimes:
Example code includes:
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
library(readr)
library(dplyr)
library(ggplot2)
library(ggridges)
library(stringr)
# Parse x
x <- "2010 September 20th" # 2010-09-20
ymd(x)
## [1] "2010-09-20"
# Parse y
y <- "02.01.2010" # 2010-01-02
dmy(y)
## [1] "2010-01-02"
# Parse z
z <- "Sep, 12th 2010 14:00" # 2010-09-12T14:00
mdy_hm(z)
## [1] "2010-09-12 14:00:00 UTC"
# Specify an order string to parse x
x <- "Monday June 1st 2010 at 4pm"
parse_date_time(x, orders = "AmdyIp")
## [1] "2010-06-01 16:00:00 UTC"
# Specify order to include both "mdy" and "dmy"
two_orders <- c("October 7, 2001", "October 13, 2002", "April 13, 2003",
"17 April 2005", "23 April 2017")
parse_date_time(two_orders, orders = c("mdy", "dmy"))
## [1] "2001-10-07 UTC" "2002-10-13 UTC" "2003-04-13 UTC" "2005-04-17 UTC"
## [5] "2017-04-23 UTC"
# Specify order to include "dOmY", "OmY" and "Y"
short_dates <- c("11 December 1282", "May 1372", "1253")
parse_date_time(short_dates, orders = c("dOmY", "OmY", "Y"))
## [1] "1282-12-11 UTC" "1372-05-01 UTC" "1253-01-01 UTC"
# Import CSV with read_csv()
akl_daily_raw <- read_csv("./RInputFiles/akl_weather_daily.csv")
## Parsed with column specification:
## cols(
## date = col_character(),
## max_temp = col_integer(),
## min_temp = col_integer(),
## mean_temp = col_integer(),
## mean_rh = col_integer(),
## events = col_character(),
## cloud_cover = col_integer()
## )
# Print akl_daily_raw
akl_daily_raw
## # A tibble: 3,661 x 7
## date max_temp min_temp mean_temp mean_rh events cloud_cover
## <chr> <int> <int> <int> <int> <chr> <int>
## 1 2007-9-1 60 51 56 75 <NA> 4
## 2 2007-9-2 60 53 56 82 Rain 4
## 3 2007-9-3 57 51 54 78 <NA> 6
## 4 2007-9-4 64 50 57 80 Rain 6
## 5 2007-9-5 53 48 50 90 Rain 7
## 6 2007-9-6 57 42 50 69 <NA> 1
## 7 2007-9-7 59 41 50 77 <NA> 4
## 8 2007-9-8 59 46 52 80 <NA> 5
## 9 2007-9-9 55 50 52 88 Rain 7
## 10 2007-9-10 59 50 54 82 Rain 4
## # ... with 3,651 more rows
# Parse date
akl_daily <- akl_daily_raw %>%
mutate(date = ymd(date))
# Print akl_daily
akl_daily
## # A tibble: 3,661 x 7
## date max_temp min_temp mean_temp mean_rh events cloud_cover
## <date> <int> <int> <int> <int> <chr> <int>
## 1 2007-09-01 60 51 56 75 <NA> 4
## 2 2007-09-02 60 53 56 82 Rain 4
## 3 2007-09-03 57 51 54 78 <NA> 6
## 4 2007-09-04 64 50 57 80 Rain 6
## 5 2007-09-05 53 48 50 90 Rain 7
## 6 2007-09-06 57 42 50 69 <NA> 1
## 7 2007-09-07 59 41 50 77 <NA> 4
## 8 2007-09-08 59 46 52 80 <NA> 5
## 9 2007-09-09 55 50 52 88 Rain 7
## 10 2007-09-10 59 50 54 82 Rain 4
## # ... with 3,651 more rows
# Plot to check work
ggplot(akl_daily, aes(x = date, y = max_temp)) +
geom_line()
## Warning: Removed 1 rows containing missing values (geom_path).
# Import "akl_weather_hourly_2016.csv"
akl_hourly_raw <- read_csv("./RInputFiles/akl_weather_hourly_2016.csv")
## Parsed with column specification:
## cols(
## year = col_integer(),
## month = col_integer(),
## mday = col_integer(),
## time = col_time(format = ""),
## temperature = col_double(),
## weather = col_character(),
## conditions = col_character(),
## events = col_character(),
## humidity = col_integer(),
## date_utc = col_datetime(format = "")
## )
# Print akl_hourly_raw
akl_hourly_raw
## # A tibble: 17,454 x 10
## year month mday time temperature weather conditions events humidity
## <int> <int> <int> <time> <dbl> <chr> <chr> <chr> <int>
## 1 2016 1 1 00:00 68.0 Clear Clear <NA> 68
## 2 2016 1 1 00:30 68.0 Clear Clear <NA> 68
## 3 2016 1 1 01:00 68.0 Clear Clear <NA> 73
## 4 2016 1 1 01:30 68.0 Clear Clear <NA> 68
## 5 2016 1 1 02:00 68.0 Clear Clear <NA> 68
## 6 2016 1 1 02:30 68.0 Clear Clear <NA> 68
## 7 2016 1 1 03:00 68.0 Clear Clear <NA> 68
## 8 2016 1 1 03:30 68.0 Cloudy Partly Cl~ <NA> 68
## 9 2016 1 1 04:00 68.0 Cloudy Scattered~ <NA> 68
## 10 2016 1 1 04:30 66.2 Cloudy Partly Cl~ <NA> 73
## # ... with 17,444 more rows, and 1 more variable: date_utc <dttm>
# Use make_date() to combine year, month and mday
akl_hourly <- akl_hourly_raw %>%
mutate(date = make_date(year = year, month = month, day = mday))
# Parse datetime_string
akl_hourly <- akl_hourly %>%
mutate(
datetime_string = paste(date, time, sep = "T"),
datetime = ymd_hms(datetime_string)
)
# Print date, time and datetime columns of akl_hourly
akl_hourly %>% select(date, time, datetime)
## # A tibble: 17,454 x 3
## date time datetime
## <date> <time> <dttm>
## 1 2016-01-01 00:00 2016-01-01 00:00:00
## 2 2016-01-01 00:30 2016-01-01 00:30:00
## 3 2016-01-01 01:00 2016-01-01 01:00:00
## 4 2016-01-01 01:30 2016-01-01 01:30:00
## 5 2016-01-01 02:00 2016-01-01 02:00:00
## 6 2016-01-01 02:30 2016-01-01 02:30:00
## 7 2016-01-01 03:00 2016-01-01 03:00:00
## 8 2016-01-01 03:30 2016-01-01 03:30:00
## 9 2016-01-01 04:00 2016-01-01 04:00:00
## 10 2016-01-01 04:30 2016-01-01 04:30:00
## # ... with 17,444 more rows
# Plot to check work
ggplot(akl_hourly, aes(x = datetime, y = temperature)) +
geom_line()
# Examine the head() of release_time
releases <- read_csv("./RInputFiles/rversions.csv")
## Parsed with column specification:
## cols(
## major = col_integer(),
## minor = col_integer(),
## patch = col_integer(),
## date = col_date(format = ""),
## datetime = col_datetime(format = ""),
## time = col_time(format = ""),
## type = col_character()
## )
release_time <- releases %>% pull(datetime)
head(release_time)
## [1] "1997-12-04 08:47:58 UTC" "1997-12-21 13:09:22 UTC"
## [3] "1998-01-10 00:31:55 UTC" "1998-03-14 19:25:55 UTC"
## [5] "1998-05-02 07:58:17 UTC" "1998-06-14 12:56:20 UTC"
# Examine the head() of the months of release_time
head(month(release_time))
## [1] 12 12 1 3 5 6
# Extract the month of releases
month(release_time) %>% table()
## .
## 1 2 3 4 5 6 7 8 9 10 11 12
## 5 6 8 18 5 16 4 7 2 15 6 13
# Extract the year of releases
year(release_time) %>% table()
## .
## 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011
## 2 10 9 6 6 5 5 4 4 4 4 6 5 4 6
## 2012 2013 2014 2015 2016 2017
## 4 4 4 5 5 3
# How often is the hour before 12 (noon)?
mean(hour(release_time) < 12)
## [1] 0.752381
# How often is the release in am?
mean(am(release_time))
## [1] 0.752381
# Use wday() to tabulate release by day of the week
wday(releases$datetime) %>% table()
## .
## 1 2 3 4 5 6 7
## 3 29 9 12 18 31 3
# Add label = TRUE to make table more readable
wday(releases$datetime, label=TRUE) %>% table()
## .
## Sun Mon Tue Wed Thu Fri Sat
## 3 29 9 12 18 31 3
# Create column wday to hold labelled week days
releases$wday <- wday(releases$datetime, label=TRUE)
# Plot barchart of weekday by type of release
ggplot(releases, aes(x=wday)) +
geom_bar() +
facet_wrap(~ type, ncol = 1, scale = "free_y")
# Add columns for year, yday and month
akl_daily <- akl_daily %>%
mutate(
year = year(date),
yday = yday(date),
month = month(date, label=TRUE))
# Plot max_temp by yday for all years
ggplot(akl_daily, aes(x = yday, y = max_temp)) +
geom_line(aes(group = year), alpha = 0.5)
## Warning: Removed 1 rows containing missing values (geom_path).
# Examine distribution of max_temp by month
ggplot(akl_daily, aes(x = max_temp, y = month, height = ..density..)) +
geom_density_ridges(stat = "density")
## Warning: Removed 10 rows containing non-finite values (stat_density).
# Create new columns hour, month and rainy
akl_hourly <- akl_hourly %>%
mutate(
hour = hour(datetime),
month = month(datetime, label=TRUE),
rainy = (weather == "Precipitation")
)
# Filter for hours between 8am and 10pm (inclusive)
akl_day <- akl_hourly %>%
filter(hour >= 8, hour <= 22)
# Summarise for each date if there is any rain
rainy_days <- akl_day %>%
group_by(month, date) %>%
summarise(
any_rain = any(rainy)
)
# Summarise for each month, the number of days with rain
rainy_days %>%
summarise(
days_rainy = sum(any_rain)
)
## # A tibble: 12 x 2
## month days_rainy
## <ord> <int>
## 1 Jan 15
## 2 Feb 13
## 3 Mar 12
## 4 Apr 15
## 5 May 21
## 6 Jun 19
## 7 Jul 22
## 8 Aug 16
## 9 Sep 25
## 10 Oct 20
## 11 Nov 19
## 12 Dec 11
r_3_4_1 <- ymd_hms("2016-05-03 07:13:28 UTC")
# Round down to day
floor_date(r_3_4_1, unit = "day")
## [1] "2016-05-03 UTC"
# Round to nearest 5 minutes
round_date(r_3_4_1, unit = "5 minutes")
## [1] "2016-05-03 07:15:00 UTC"
# Round up to week
ceiling_date(r_3_4_1, unit = "week")
## [1] "2016-05-08 UTC"
# Subtract r_3_4_1 rounded down to day
r_3_4_1 - floor_date(r_3_4_1, unit = "day")
## Time difference of 7.224444 hours
# Create day_hour, datetime rounded down to hour
akl_hourly <- akl_hourly %>%
mutate(
day_hour = floor_date(datetime, unit = "hour")
)
# Count observations per hour
akl_hourly %>%
count(day_hour)
## # A tibble: 8,770 x 2
## day_hour n
## <dttm> <int>
## 1 2016-01-01 00:00:00 2
## 2 2016-01-01 01:00:00 2
## 3 2016-01-01 02:00:00 2
## 4 2016-01-01 03:00:00 2
## 5 2016-01-01 04:00:00 2
## 6 2016-01-01 05:00:00 2
## 7 2016-01-01 06:00:00 2
## 8 2016-01-01 07:00:00 2
## 9 2016-01-01 08:00:00 2
## 10 2016-01-01 09:00:00 2
## # ... with 8,760 more rows
# Find day_hours with n != 2
akl_hourly %>%
count(day_hour) %>%
filter(n != 2) %>%
arrange(desc(n))
## # A tibble: 92 x 2
## day_hour n
## <dttm> <int>
## 1 2016-04-03 02:00:00 4
## 2 2016-09-25 00:00:00 4
## 3 2016-06-26 09:00:00 1
## 4 2016-09-01 23:00:00 1
## 5 2016-09-02 01:00:00 1
## 6 2016-09-04 11:00:00 1
## 7 2016-09-04 16:00:00 1
## 8 2016-09-04 17:00:00 1
## 9 2016-09-05 00:00:00 1
## 10 2016-09-05 15:00:00 1
## # ... with 82 more rows
Chapter 3 - Arithmetic with Dates and Times
Taking differences of datetimes:
Time spans - difficult because they do not have a constant meaning (e.g., the impact of daylight saving time):
Intervals - third option in lubridate for storing times:
Example code includes:
# The date of landing and moment of step
date_landing <- mdy("July 20, 1969")
moment_step <- mdy_hms("July 20, 1969, 02:56:15", tz = "UTC")
# How many days since the first man on the moon?
difftime(today(), date_landing, units = "days")
## Time difference of 17792 days
# How many seconds since the first man on the moon?
difftime(now(), moment_step, units = "secs")
## Time difference of 1537270659 secs
# Three dates
mar_11 <- ymd_hms("2017-03-11 12:00:00",
tz = "America/Los_Angeles")
mar_12 <- ymd_hms("2017-03-12 12:00:00",
tz = "America/Los_Angeles")
mar_13 <- ymd_hms("2017-03-13 12:00:00",
tz = "America/Los_Angeles")
# Difference between mar_13 and mar_12 in seconds
difftime(mar_13, mar_12, units = "secs")
## Time difference of 86400 secs
# Difference between mar_12 and mar_11 in seconds
difftime(mar_12, mar_11, units = "secs")
## Time difference of 82800 secs
# Add a period of one week to mon_2pm
mon_2pm <- dmy_hm("27 Aug 2018 14:00")
mon_2pm + weeks(1)
## [1] "2018-09-03 14:00:00 UTC"
# Add a duration of 81 hours to tue_9am
tue_9am <- dmy_hm("28 Aug 2018 9:00")
tue_9am + dhours(81)
## [1] "2018-08-31 18:00:00 UTC"
# Subtract a period of five years from today()
today() - years(5)
## [1] "2013-04-06"
# Subtract a duration of five years from today()
today() - dyears(5)
## [1] "2013-04-07"
# Time of North American Eclipse 2017
eclipse_2017 <- ymd_hms("2017-08-21 18:26:40")
# Duration of 29 days, 12 hours, 44 mins and 3 secs
synodic <- ddays(29) + dhours(12) + dminutes(44) + dseconds(3)
# 223 synodic months
saros <- 223 * synodic
# Add saros to eclipse_2017
eclipse_2017 + saros
## [1] "2035-09-02 02:09:49 UTC"
# Add a period of 8 hours to today
today_8am <- today() + hours(8)
# Sequence of two weeks from 1 to 26
every_two_weeks <- 1:26 * weeks(2)
# Create datetime for every two weeks for a year
today_8am + every_two_weeks
## [1] "2018-04-20 08:00:00 UTC" "2018-05-04 08:00:00 UTC"
## [3] "2018-05-18 08:00:00 UTC" "2018-06-01 08:00:00 UTC"
## [5] "2018-06-15 08:00:00 UTC" "2018-06-29 08:00:00 UTC"
## [7] "2018-07-13 08:00:00 UTC" "2018-07-27 08:00:00 UTC"
## [9] "2018-08-10 08:00:00 UTC" "2018-08-24 08:00:00 UTC"
## [11] "2018-09-07 08:00:00 UTC" "2018-09-21 08:00:00 UTC"
## [13] "2018-10-05 08:00:00 UTC" "2018-10-19 08:00:00 UTC"
## [15] "2018-11-02 08:00:00 UTC" "2018-11-16 08:00:00 UTC"
## [17] "2018-11-30 08:00:00 UTC" "2018-12-14 08:00:00 UTC"
## [19] "2018-12-28 08:00:00 UTC" "2019-01-11 08:00:00 UTC"
## [21] "2019-01-25 08:00:00 UTC" "2019-02-08 08:00:00 UTC"
## [23] "2019-02-22 08:00:00 UTC" "2019-03-08 08:00:00 UTC"
## [25] "2019-03-22 08:00:00 UTC" "2019-04-05 08:00:00 UTC"
jan_31 <- ymd("2018-01-31")
# A sequence of 1 to 12 periods of 1 month
month_seq <- 1:12 * months(1)
# Add 1 to 12 months to jan_31
jan_31 + month_seq
## [1] NA "2018-03-31" NA "2018-05-31" NA
## [6] "2018-07-31" "2018-08-31" NA "2018-10-31" NA
## [11] "2018-12-31" "2019-01-31"
# Replace + with %m+%
jan_31 %m+% month_seq
## [1] "2018-02-28" "2018-03-31" "2018-04-30" "2018-05-31" "2018-06-30"
## [6] "2018-07-31" "2018-08-31" "2018-09-30" "2018-10-31" "2018-11-30"
## [11] "2018-12-31" "2019-01-31"
# Replace + with %m-%
jan_31 %m-% month_seq
## [1] "2017-12-31" "2017-11-30" "2017-10-31" "2017-09-30" "2017-08-31"
## [6] "2017-07-31" "2017-06-30" "2017-05-31" "2017-04-30" "2017-03-31"
## [11] "2017-02-28" "2017-01-31"
# Create monarchs
mNames <- c('Elizabeth II' ,'Victoria' ,'George V' ,'George III' ,'George VI' ,'George IV' ,'Edward VII' ,'William IV' ,'Edward VIII' ,'George III(also United Kingdom)' ,'George II' ,'George I' ,'Anne' ,'Henry III' ,'Edward III' ,'Elizabeth I' ,'Henry VI' ,'Henry VI' ,'Æthelred II' ,'Æthelred II' ,'Henry VIII' ,'Charles II' ,'Henry I' ,'Henry II(co-ruler with Henry the Young King)' ,'Edward I' ,'Alfred the Great' ,'Edward the Elder' ,'Charles I' ,'Henry VII' ,'Edward the Confessor' ,'Richard II' ,'James I' ,'Edward IV' ,'Edward IV' ,'William I' ,'Edward II' ,'Cnut' ,'Stephen' ,'Stephen' ,'John' ,'Edgar I' ,'Æthelstan' ,'Henry IV' ,'William III(co-ruler with Mary II)' ,'Henry the Young King(co-ruler with Henry II)' ,'William II' ,'Richard I' ,'Eadred' ,'Henry V' ,'Edmund I' ,'Edward VI' ,'Mary II(co-ruler with William III)' ,'Mary I' ,'Anne(also Kingdom of Great Britain)' ,'Eadwig' ,'James II' ,'Edward the Martyr' ,'Harold I' ,'Harthacnut' ,'Richard III' ,'Louis (disputed)' ,'Harold II' ,'Edmund II' ,'Matilda (disputed)' ,'Edward V' ,'Edgar II' ,'Sweyn Forkbeard' ,'Jane (disputed)' ,'James VI' ,'William I' ,'Constantine II' ,'David II' ,'Alexander III' ,'Malcolm III' ,'Alexander II' ,'James I' ,'Malcolm II' ,'James V' ,'David I' ,'James III' ,'Charles II' ,'Charles II' ,'James IV' ,'Mary I' ,'Charles I' ,'Kenneth II' ,'James II' ,'Robert I' ,'Robert II' ,'Alexander I' ,'Macbeth' ,'Robert III' ,'Constantine I' ,'Kenneth MacAlpin' ,'William II' ,'Malcolm IV' ,'Giric(co-ruler with Eochaid?)' ,'Donald II' ,'Malcolm I' ,'Edgar' ,'Kenneth III' ,'Indulf' ,'Duncan I' ,'Mary II' ,'Amlaíb' ,'Anne(also Kingdom of Great Britain)' ,'Dub' ,'Cuilén' ,'Domnall mac Ailpín' ,'James VII' ,'Margaret' ,'John Balliol' ,'Donald III' ,'Constantine III' ,'Áed mac Cináeda' ,'Lulach' ,'Duncan II' ,'Ruaidrí Ua Conchobair' ,'Edward Bruce (disputed)' ,'Brian Ua Néill (disputed)' ,'Gruffudd ap Cynan' ,'Llywelyn the Great' ,'Owain Gwynedd' ,'Dafydd ab Owain Gwynedd' ,'Hywel ab Owain Gwynedd' 
,'Llywelyn ap Gruffudd' ,'Owain Glyndŵr (disputed)' ,'Owain Goch ap Gruffydd' ,'Owain Lawgoch (disputed)' ,'Dafydd ap Llywelyn' ,'Dafydd ap Gruffydd')
mDominion <- c('United Kingdom' ,'United Kingdom' ,'United Kingdom' ,'United Kingdom' ,'United Kingdom' ,'United Kingdom' ,'United Kingdom' ,'United Kingdom' ,'United Kingdom' ,'Great Britain' ,'Great Britain' ,'Great Britain' ,'Great Britain' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Ireland' ,'Ireland' ,'Ireland' ,'Gwynedd' ,'Gwynedd' ,'Gwynedd' ,'Gwynedd' ,'Gwynedd' ,'Wales' ,'Wales' ,'Wales' ,'Wales' ,'Wales' ,'Wales')
mFrom <- c('1952-02-06' ,'1837-06-20' ,'1910-05-06' ,'1801-01-01' ,'1936-12-11' ,'1820-01-29' ,'1901-01-22' ,'1830-06-26' ,'1936-01-20' ,'1760-10-25' ,'1727-06-22' ,'1714-08-01' ,'1707-05-01' ,'NA' ,'1327-01-25' ,'1558-11-17' ,'1422-08-31' ,'1470-10-31' ,'978-03-18' ,'1014-02-03' ,'1509-04-22' ,'1649-01-30' ,'1100-08-03' ,'1154-10-25' ,'1272-11-20' ,'871-04-24' ,'899-10-27' ,'1625-03-27' ,'1485-08-22' ,'1042-06-08' ,'1377-06-22' ,'1603-03-24' ,'1461-03-04' ,'1471-04-11' ,'1066-12-12' ,'1307-07-07' ,'1016-11-30' ,'1135-12-22' ,'1141-11-01' ,'1199-04-06' ,'959-10-01' ,'924-08-02' ,'1399-09-29' ,'1689-02-13' ,'1170-06-14' ,'1087-09-09' ,'1189-07-06' ,'946-05-26' ,'1413-03-21' ,'939-10-27' ,'1547-01-28' ,'1689-02-13' ,'1553-07-19' ,'1702-03-08' ,'955-11-23' ,'1685-02-06' ,'975-07-09' ,'1037-11-12' ,'1040-03-17' ,'1483-06-26' ,'1216-06-14' ,'1066-01-05' ,'1016-04-23' ,'1141-04-07' ,'1483-04-09' ,'1066-10-15' ,'1013-12-25' ,'1553-07-10' ,'1567-07-24' ,'1165-12-09' ,'900-01-01' ,'1329-06-07' ,'1249-07-06' ,'1058-03-17' ,'1214-12-04' ,'1406-04-04' ,'1005-03-25' ,'1513-09-09' ,'1124-04-23' ,'1460-08-03' ,'1649-01-30' ,'1660-05-29' ,'1488-06-11' ,'1542-12-14' ,'1625-03-27' ,'971-01-01' ,'1437-02-21' ,'1306-03-25' ,'1371-02-22' ,'1107-01-08' ,'1040-08-14' ,'1390-04-19' ,'862-01-01' ,'843-01-01' ,'1689-05-11' ,'1153-05-24' ,'878-01-01' ,'889-01-01' ,'943-01-01' ,'1097-01-01' ,'997-01-01' ,'954-01-01' ,'1034-11-25' ,'1689-04-11' ,'971-01-01' ,'1702-03-08' ,'962-01-01' ,'NA' ,'858-01-01' ,'1685-02-06' ,'1286-11-25' ,'1292-11-17' ,'1093-11-13' ,'1095-01-01' ,'877-01-01' ,'1057-08-15' ,'1094-05-01' ,'1166-01-01' ,'1315-06-01' ,'1258-01-01' ,'1081-01-01' ,'1195-01-01' ,'1137-01-01' ,'1170-01-01' ,'1170-01-01' ,'1253-01-01' ,'1400-09-16' ,'1246-02-25' ,'1372-05-01' ,'1240-04-12' ,'1282-12-11')
mTo <- c('2018-02-08' ,'1901-01-22' ,'1936-01-20' ,'1820-01-29' ,'1952-02-06' ,'1830-06-26' ,'1910-05-06' ,'1837-06-20' ,'1936-12-11' ,'1801-01-01' ,'1760-10-25' ,'1727-06-11' ,'1714-08-01' ,'1272-11-16' ,'1377-06-21' ,'1603-03-24' ,'1461-03-04' ,'1471-04-11' ,'1013-12-25' ,'1016-04-23' ,'1547-01-28' ,'1685-02-06' ,'1135-12-01' ,'1189-07-06' ,'1307-07-07' ,'899-10-26' ,'924-07-17' ,'1649-01-30' ,'1509-04-21' ,'1066-01-05' ,'1399-09-29' ,'1625-03-27' ,'1470-10-03' ,'1483-04-09' ,'1087-09-09' ,'1327-01-20' ,'1035-11-12' ,'1141-04-07' ,'1154-10-25' ,'1216-10-19' ,'975-07-08' ,'939-10-27' ,'1413-03-20' ,'1702-03-08' ,'1183-06-11' ,'1100-08-02' ,'1199-04-06' ,'955-11-23' ,'1422-08-31' ,'946-05-26' ,'1553-07-06' ,'1694-12-28' ,'1558-11-17' ,'1707-04-30' ,'959-10-01' ,'1688-12-11' ,'978-03-18' ,'1040-03-17' ,'1042-06-08' ,'1485-08-22' ,'1217-09-22' ,'1066-10-14' ,'1016-11-30' ,'1141-11-01' ,'1483-06-26' ,'1066-12-17' ,'1014-02-03' ,'1553-07-19' ,'1625-03-27' ,'1214-12-04' ,'943-01-01' ,'1371-02-22' ,'1286-03-19' ,'1093-11-13' ,'1249-07-06' ,'1437-02-21' ,'1034-11-25' ,'1542-12-14' ,'1153-05-24' ,'1488-06-11' ,'1651-09-03' ,'1685-02-06' ,'1513-09-09' ,'1567-07-24' ,'1649-01-30' ,'995-01-01' ,'1460-08-03' ,'1329-06-07' ,'1390-04-19' ,'1124-04-23' ,'1057-08-15' ,'1406-04-04' ,'877-01-01' ,'858-02-13' ,'1702-03-08' ,'1165-12-09' ,'889-01-01' ,'900-01-01' ,'954-01-01' ,'1107-01-08' ,'1005-03-25' ,'962-01-01' ,'1040-08-14' ,'1694-12-28' ,'977-01-01' ,'1707-04-30' ,'NA' ,'971-01-01' ,'862-04-13' ,'1688-12-11' ,'1290-09-26' ,'1296-07-10' ,'1097-01-01' ,'1097-01-01' ,'878-01-01' ,'1058-03-17' ,'1094-11-12' ,'1193-01-01' ,'1318-10-14' ,'1260-01-01' ,'1137-01-01' ,'1240-04-11' ,'1170-01-01' ,'1195-01-01' ,'1170-01-01' ,'1282-12-11' ,'1416-01-01' ,'1255-01-01' ,'1378-07-01' ,'1246-02-25' ,'1283-10-03')
# Rebuild an ISO "YYYY-MM-DD" date string from a character vector of
# date pieces, zero-padding the year to 4 digits so that early dates
# such as "66-01-26" parse correctly with lubridate::ymd().
#
# Args:
#   x: character vector produced by splitting a "Y-m-d" string on "-";
#      x[1] = year, x[2] = month, x[3] = day. A year of NA or the
#      literal string "NA" marks a missing date.
#
# Returns:
#   A single "YYYY-MM-DD" string, or NA when the date is missing.
padMDate <- function(x) {
  yr <- x[1]
  # Scalar guard: use short-circuiting || (not vectorized |) so the
  # "NA"-string comparison is skipped when yr is a true NA.
  if (is.na(yr) || identical(yr, "NA")) {
    return(NA)
  }
  # Left-pad the year with zeros to width 4; base-R equivalent of
  # stringr::str_pad(yr, 4, pad = "0"), removing the stringr dependency.
  yr <- paste0(strrep("0", max(0L, 4L - nchar(yr))), yr)
  paste0(c(yr, x[2], x[3]), collapse = "-")
}
monarchs <- tibble::tibble(name=mNames, dominion=mDominion,
from=ymd(sapply(str_split(mFrom, "-"), FUN=padMDate)),
to=ymd(sapply(str_split(mTo, "-"), FUN=padMDate))
)
# Print monarchs
monarchs
## # A tibble: 131 x 4
## name dominion from to
## <chr> <chr> <date> <date>
## 1 Elizabeth II United Kingdom 1952-02-06 2018-02-08
## 2 Victoria United Kingdom 1837-06-20 1901-01-22
## 3 George V United Kingdom 1910-05-06 1936-01-20
## 4 George III United Kingdom 1801-01-01 1820-01-29
## 5 George VI United Kingdom 1936-12-11 1952-02-06
## 6 George IV United Kingdom 1820-01-29 1830-06-26
## 7 Edward VII United Kingdom 1901-01-22 1910-05-06
## 8 William IV United Kingdom 1830-06-26 1837-06-20
## 9 Edward VIII United Kingdom 1936-01-20 1936-12-11
## 10 George III(also United Kingdom) Great Britain 1760-10-25 1801-01-01
## # ... with 121 more rows
# Create an interval for reign
monarchs <- monarchs %>%
mutate(reign = from %--% to)
# Find the length of reign, and arrange
monarchs %>%
mutate(length = int_length(reign)) %>%
arrange(desc(length)) %>%
select(name, length, dominion)
## # A tibble: 131 x 3
## name length dominion
## <chr> <dbl> <chr>
## 1 Elizabeth II 2083017600 United Kingdom
## 2 Victoria 2006726400 United Kingdom
## 3 James VI 1820102400 Scotland
## 4 Gruffudd ap Cynan 1767139200 Gwynedd
## 5 Edward III 1590624000 England
## 6 William I 1545868800 Scotland
## 7 Llywelyn the Great 1428796800 Gwynedd
## 8 Elizabeth I 1399507200 England
## 9 Constantine II 1356912000 Scotland
## 10 David II 1316304000 Scotland
## # ... with 121 more rows
# Print halleys
pDate <- c('66-01-26', '141-03-25', '218-04-06', '295-04-07', '374-02-13', '451-07-03', '530-11-15', '607-03-26', '684-11-26', '760-06-10', '837-02-25', '912-07-27', '989-09-02', '1066-03-25', '1145-04-19', '1222-09-10', '1301-10-22', '1378-11-09', '1456-01-08', '1531-08-26', '1607-10-27', '1682-09-15', '1758-03-13', '1835-11-16', '1910-04-20', '1986-02-09', '2061-07-28')
sDate <- c('66-01-25', '141-03-22', '218-04-06', '295-04-07', '374-02-13', '451-06-28', '530-09-27', '607-03-15', '684-10-02', '760-05-20', '837-02-25', '912-07-18', '989-09-02', '1066-01-01', '1145-04-15', '1222-09-10', '1301-10-22', '1378-11-09', '1456-01-08', '1531-08-26', '1607-10-27', '1682-09-15', '1758-03-13', '1835-08-01', '1910-04-20', '1986-02-09', '2061-07-28')
eDate <- c('66-01-26', '141-03-25', '218-05-17', '295-04-20', '374-02-16', '451-07-03', '530-11-15', '607-03-26', '684-11-26', '760-06-10', '837-02-28', '912-07-27', '989-09-05', '1066-03-25', '1145-04-19', '1222-09-28', '1301-10-31', '1378-11-14', '1456-06-09', '1531-08-26', '1607-10-27', '1682-09-15', '1758-12-25', '1835-11-16', '1910-05-20', '1986-02-09', '2061-07-28')
halleys <- tibble::tibble(perihelion_date=ymd(sapply(str_split(pDate, "-"), FUN=padMDate)),
start_date=ymd(sapply(str_split(sDate, "-"), FUN=padMDate)),
end_date=ymd(sapply(str_split(eDate, "-"), FUN=padMDate))
)
# New column for interval from start to end date
halleys <- halleys %>%
mutate(visible = start_date %--% end_date)
# The visitation of 1066
halleys_1066 <- halleys[14, ]
# Monarchs in power on perihelion date
monarchs %>%
filter(halleys_1066$perihelion_date %within% reign) %>%
select(name, from, to, dominion)
## # A tibble: 2 x 4
## name from to dominion
## <chr> <date> <date> <chr>
## 1 Harold II 1066-01-05 1066-10-14 England
## 2 Malcolm III 1058-03-17 1093-11-13 Scotland
# Monarchs whose reign overlaps visible time
monarchs %>%
filter(int_overlaps(halleys_1066$visible, reign)) %>%
select(name, from, to, dominion)
## # A tibble: 3 x 4
## name from to dominion
## <chr> <date> <date> <chr>
## 1 Edward the Confessor 1042-06-08 1066-01-05 England
## 2 Harold II 1066-01-05 1066-10-14 England
## 3 Malcolm III 1058-03-17 1093-11-13 Scotland
# New columns for duration and period
monarchs <- monarchs %>%
mutate(
duration = as.duration(reign),
period = as.period(reign))
# Examine results
monarchs %>%
select(name, duration, period) %>%
head(10) %>%
print.data.frame()
## name duration
## 1 Elizabeth II 2083017600s (~66.01 years)
## 2 Victoria 2006726400s (~63.59 years)
## 3 George V 811296000s (~25.71 years)
## 4 George III 601948800s (~19.07 years)
## 5 George VI 478224000s (~15.15 years)
## 6 George IV 328406400s (~10.41 years)
## 7 Edward VII 292982400s (~9.28 years)
## 8 William IV 220406400s (~6.98 years)
## 9 Edward VIII 28166400s (~46.57 weeks)
## 10 George III(also United Kingdom) 1268092800s (~40.18 years)
## period
## 1 66y 0m 2d 0H 0M 0S
## 2 63y 7m 2d 0H 0M 0S
## 3 25y 8m 14d 0H 0M 0S
## 4 19y 0m 28d 0H 0M 0S
## 5 15y 1m 26d 0H 0M 0S
## 6 10y 4m 28d 0H 0M 0S
## 7 9y 3m 14d 0H 0M 0S
## 8 6y 11m 25d 0H 0M 0S
## 9 10m 21d 0H 0M 0S
## 10 40y 2m 7d 0H 0M 0S
Chapter 4 - Problems in Practice
Time zones - ways to keep track of times in different locations (can pose analysis challenges):
Importing and exporting datetimes:
Wrap-up:
Example code includes:
# Game2: CAN vs NZL in Edmonton
game2 <- mdy_hm("June 11 2015 19:00")
# Game3: CHN vs NZL in Winnipeg
game3 <- mdy_hm("June 15 2015 18:30")
# Set the timezone to "America/Edmonton"
game2_local <- force_tz(game2, tzone = "America/Edmonton")
game2_local
## [1] "2015-06-11 19:00:00 MDT"
# Set the timezone to "America/Winnipeg"
game3_local <- force_tz(game3, tzone = "America/Winnipeg")
game3_local
## [1] "2015-06-15 18:30:00 CDT"
# How long does the team have to rest?
as.period(game2_local %--% game3_local)
## [1] "3d 22H 30M 0S"
# What time is game2_local in NZ?
with_tz(game2_local, tzone = "Pacific/Auckland")
## [1] "2015-06-12 13:00:00 NZST"
# What time is game2_local in Corvallis, Oregon?
with_tz(game2_local, tzone = "America/Los_Angeles")
## [1] "2015-06-11 18:00:00 PDT"
# What time is game3_local in NZ?
with_tz(game3_local, tzone = "Pacific/Auckland")
## [1] "2015-06-16 11:30:00 NZST"
# Examine datetime and date_utc columns
head(akl_hourly$datetime)
## [1] "2016-01-01 00:00:00 UTC" "2016-01-01 00:30:00 UTC"
## [3] "2016-01-01 01:00:00 UTC" "2016-01-01 01:30:00 UTC"
## [5] "2016-01-01 02:00:00 UTC" "2016-01-01 02:30:00 UTC"
head(akl_hourly$date_utc)
## [1] "2015-12-31 11:00:00 UTC" "2015-12-31 11:30:00 UTC"
## [3] "2015-12-31 12:00:00 UTC" "2015-12-31 12:30:00 UTC"
## [5] "2015-12-31 13:00:00 UTC" "2015-12-31 13:30:00 UTC"
# Force datetime to Pacific/Auckland
akl_hourly <- akl_hourly %>%
mutate(
datetime = force_tz(datetime, tzone = "Pacific/Auckland"))
# Reexamine datetime
head(akl_hourly$datetime)
## [1] "2016-01-01 00:00:00 NZDT" "2016-01-01 00:30:00 NZDT"
## [3] "2016-01-01 01:00:00 NZDT" "2016-01-01 01:30:00 NZDT"
## [5] "2016-01-01 02:00:00 NZDT" "2016-01-01 02:30:00 NZDT"
# Are datetime and date_utc the same moments
table(akl_hourly$datetime - akl_hourly$date_utc)
##
## -82800 0 3600
## 2 17450 2
# Import auckland hourly data
akl_hourly <- read_csv("./RInputFiles/akl_weather_hourly_2016.csv")
## Parsed with column specification:
## cols(
## year = col_integer(),
## month = col_integer(),
## mday = col_integer(),
## time = col_time(format = ""),
## temperature = col_double(),
## weather = col_character(),
## conditions = col_character(),
## events = col_character(),
## humidity = col_integer(),
## date_utc = col_datetime(format = "")
## )
# Examine structure of time column
str(akl_hourly$time)
## Classes 'hms', 'difftime' atomic [1:17454] 0 1800 3600 5400 7200 9000 10800 12600 14400 16200 ...
## ..- attr(*, "units")= chr "secs"
# Examine head of time column
head(akl_hourly$time)
## 00:00:00
## 00:30:00
## 01:00:00
## 01:30:00
## 02:00:00
## 02:30:00
# A plot using just time
ggplot(akl_hourly, aes(x = time, y = temperature)) +
geom_line(aes(group = make_date(year, month, mday)), alpha = 0.2)
library(microbenchmark)
library(fasttime)
# Examine structure of dates
dates <- paste0(gsub(" ", "T", as.character(akl_hourly$date_utc)), "Z")
str(dates)
## chr [1:17454] "2015-12-31T11:00:00Z" "2015-12-31T11:30:00Z" ...
# Use fastPOSIXct() to parse dates
fastPOSIXct(dates) %>% str()
## POSIXct[1:17454], format: "2015-12-31 05:00:00" "2015-12-31 05:30:00" ...
# Compare speed of fastPOSIXct() to ymd_hms()
microbenchmark(
ymd_hms = ymd_hms(dates),
fasttime = fastPOSIXct(dates),
times = 20)
## Unit: milliseconds
## expr min lq mean median uq max neval
## ymd_hms 20.387000 23.975696 27.81115 26.755940 30.550697 39.83252 20
## fasttime 1.699805 1.782901 2.66241 1.910208 2.699714 10.55411 20
## cld
## b
## a
# Head of dates
head(dates)
## [1] "2015-12-31T11:00:00Z" "2015-12-31T11:30:00Z" "2015-12-31T12:00:00Z"
## [4] "2015-12-31T12:30:00Z" "2015-12-31T13:00:00Z" "2015-12-31T13:30:00Z"
# Parse dates with fast_strptime
fast_strptime(dates,
format = "%Y-%m-%dT%H:%M:%SZ") %>% str()
## POSIXlt[1:17454], format: "2015-12-31 11:00:00" "2015-12-31 11:30:00" ...
# Compare speed to ymd_hms() and fasttime
microbenchmark(
ymd_hms = ymd_hms(dates),
fasttime = fastPOSIXct(dates),
fast_strptime = fast_strptime(dates,
format = "%Y-%m-%dT%H:%M:%SZ"),
times = 20)
## Unit: milliseconds
## expr min lq mean median uq
## ymd_hms 20.495557 24.935932 40.443395 26.790282 34.311110
## fasttime 1.639803 1.703161 2.038030 1.790993 1.973172
## fast_strptime 1.361897 1.429203 2.072649 1.587893 1.868167
## max neval cld
## 232.550750 20 b
## 3.676332 20 a
## 8.872856 20 a
finished <- "I finished 'Dates and Times in R' on Thursday, September 20, 2017!"
# Create a stamp based on "Sep 20 2017"
date_stamp <- stamp("September 20, 2017", orders="mdy")
## Multiple formats matched: "%Om %d, %Y"(1), "%B %d, %Y"(1)
## Using: "%B %d, %Y"
# Print date_stamp
date_stamp
## function (x, locale = "English_United States.1252")
## {
## {
## old_lc_time <- Sys.getlocale("LC_TIME")
## if (old_lc_time != locale) {
## Sys.setlocale("LC_TIME", locale)
## on.exit(Sys.setlocale("LC_TIME", old_lc_time))
## }
## }
## format(x, format = "%B %d, %Y")
## }
## <environment: 0x00000000119df538>
# Call date_stamp on today()
date_stamp(today())
## [1] "April 06, 2018"
# Create and call a stamp based on "09/20/2017"
stamp("09/20/2017", orders="mdy")(today())
## Multiple formats matched: "%Om/%d/%Y"(1), "%m/%d/%Y"(1)
## Using: "%Om/%d/%Y"
## [1] "04/06/2018"
# Use string finished for stamp()
stamp(finished, orders="amdy")(today())
## Multiple formats matched: "I finished 'Dates and Times in R' on %A, %B %d, %Y!"(1), "I finished 'Dates and Times in R' on %A, %Om %d, %Y!"(0)
## Using: "I finished 'Dates and Times in R' on %A, %B %d, %Y!"
## [1] "I finished 'Dates and Times in R' on Friday, April 06, 2018!"
Chapter 1 - Working with Increasingly Large Data Sets
What is scalable data processing?:
Working with “out of core” objects using the Bigmemory Project:
References vs. Copies:
Example code includes:
# Load the microbenchmark package
library(microbenchmark)
# Compare the timings for sorting different sizes of vector
mb <- microbenchmark(
# Sort a random normal vector length 1e5
"1e5" = sort(rnorm(1e5)),
# Sort a random normal vector length 2.5e5
"2.5e5" = sort(rnorm(2.5e5)),
# Sort a random normal vector length 5e5
"5e5" = sort(rnorm(5e5)),
"7.5e5" = sort(rnorm(7.5e5)),
"1e6" = sort(rnorm(1e6)),
times = 10
)
# Plot the resulting benchmark object
plot(mb)
# Load the bigmemory package
library(bigmemory)
# Create the big.matrix object: x
x <- read.big.matrix("./RInputFiles/mortgage-sample.csv", header = TRUE,
type = "integer",
backingfile = "mortgage-sample.bin",
descriptorfile = "mortgage-sample.desc")
# Find the dimensions of x
dim(x)
## [1] 70000 16
# Attach mortgage-sample.desc
mort <- attach.big.matrix("mortgage-sample.desc")
# Find the dimensions of mort
dim(mort)
## [1] 70000 16
# Look at the first 6 rows of mort
head(mort)
## enterprise record_number msa perc_minority tract_income_ratio
## [1,] 1 566 1 1 3
## [2,] 1 116 1 3 2
## [3,] 1 239 1 2 2
## [4,] 1 62 1 2 3
## [5,] 1 106 1 2 3
## [6,] 1 759 1 3 3
## borrower_income_ratio loan_purpose federal_guarantee borrower_race
## [1,] 1 2 4 3
## [2,] 1 2 4 5
## [3,] 3 8 4 5
## [4,] 3 2 4 5
## [5,] 3 2 4 9
## [6,] 2 2 4 9
## co_borrower_race borrower_gender co_borrower_gender num_units
## [1,] 9 2 4 1
## [2,] 9 1 4 1
## [3,] 5 1 2 1
## [4,] 9 2 4 1
## [5,] 9 3 4 1
## [6,] 9 1 2 2
## affordability year type
## [1,] 3 2010 1
## [2,] 3 2008 1
## [3,] 4 2014 0
## [4,] 4 2009 1
## [5,] 4 2013 1
## [6,] 4 2010 1
# Create mort
mort <- attach.big.matrix("mortgage-sample.desc")
# Look at the first 3 rows
mort[1:3, ]
## enterprise record_number msa perc_minority tract_income_ratio
## [1,] 1 566 1 1 3
## [2,] 1 116 1 3 2
## [3,] 1 239 1 2 2
## borrower_income_ratio loan_purpose federal_guarantee borrower_race
## [1,] 1 2 4 3
## [2,] 1 2 4 5
## [3,] 3 8 4 5
## co_borrower_race borrower_gender co_borrower_gender num_units
## [1,] 9 2 4 1
## [2,] 9 1 4 1
## [3,] 5 1 2 1
## affordability year type
## [1,] 3 2010 1
## [2,] 3 2008 1
## [3,] 4 2014 0
# Create a table of the number of mortgages for each year in the data set
table(mort[, "year"])
##
## 2008 2009 2010 2011 2012 2013 2014 2015
## 8468 11101 8836 7996 10935 10216 5714 6734
a <- getLoadedDLLs()
length(a)
## [1] 39
R.utils::gcDLLs()
## named list()
a <- getLoadedDLLs()
length(a)
## [1] 39
# Load the biganalytics package (error in loading to Knit file, works OK otherwise)
library(biganalytics)
## Loading required package: foreach
## Loading required package: biglm
## Loading required package: DBI
# Get the column means of mort
colmean(mort)
## enterprise record_number msa
## 1.3814571 499.9080571 0.8943571
## perc_minority tract_income_ratio borrower_income_ratio
## 1.9701857 2.3431571 2.6898857
## loan_purpose federal_guarantee borrower_race
## 3.7670143 3.9840857 5.3572429
## co_borrower_race borrower_gender co_borrower_gender
## 7.0002714 1.4590714 3.0494857
## num_units affordability year
## 1.0398143 4.2863429 2011.2714714
## type
## 0.5300429
# Use biganalytics' summary function to get a summary of the data
summary(mort)
## min max mean NAs
## enterprise 1.0000000 2.0000000 1.3814571 0.0000000
## record_number 0.0000000 999.0000000 499.9080571 0.0000000
## msa 0.0000000 1.0000000 0.8943571 0.0000000
## perc_minority 1.0000000 9.0000000 1.9701857 0.0000000
## tract_income_ratio 1.0000000 9.0000000 2.3431571 0.0000000
## borrower_income_ratio 1.0000000 9.0000000 2.6898857 0.0000000
## loan_purpose 1.0000000 9.0000000 3.7670143 0.0000000
## federal_guarantee 1.0000000 4.0000000 3.9840857 0.0000000
## borrower_race 1.0000000 9.0000000 5.3572429 0.0000000
## co_borrower_race 1.0000000 9.0000000 7.0002714 0.0000000
## borrower_gender 1.0000000 9.0000000 1.4590714 0.0000000
## co_borrower_gender 1.0000000 9.0000000 3.0494857 0.0000000
## num_units 1.0000000 4.0000000 1.0398143 0.0000000
## affordability 0.0000000 9.0000000 4.2863429 0.0000000
## year 2008.0000000 2015.0000000 2011.2714714 0.0000000
## type 0.0000000 1.0000000 0.5300429 0.0000000
# Use deepcopy() to create first_three
first_three <- deepcopy(mort, cols = 1:3,
backingfile = "first_three.bin",
descriptorfile = "first_three.desc")
# Set first_three_2 equal to first_three
first_three_2 <- first_three
# Set the value in the first row and first column of first_three to NA
first_three[1, 1] <- NA
# Verify the change shows up in first_three_2
first_three_2[1, 1]
## [1] NA
# but not in mort
mort[1, 1]
## [1] 1
Chapter 2 - Processing and Analyzing Data with bigmemory
The Bigmemory Suite of Packages:
Split-Apply-Combine (aka Split-Compute-Combine), run in this course using split() Map() Reduce():
Visualize results using tidyverse:
Limitations of bigmemory - process is useful for dense, numeric matrices that can be stored on hard disk:
Example code includes:
library(bigtabulate)
library(tidyr)
library(ggplot2)
library(biganalytics)
library(dplyr)
race_cat <- c('Native Am', 'Asian', 'Black', 'Pacific Is', 'White', 'Two or More', 'Hispanic', 'Not Avail')
# Call bigtable to create a variable called race_table
race_table <- bigtable(mort, "borrower_race")
# Rename the elements of race_table
names(race_table) <- race_cat
race_table
## Native Am Asian Black Pacific Is White Two or More
## 143 4438 2020 195 50006 528
## Hispanic Not Avail
## 4040 8630
# Create a table of the borrower race by year
race_year_table <- bigtable(mort, c("borrower_race", "year"))
# Convert rydf to a data frame
rydf <- as.data.frame(race_year_table)
# Create the new column Race
rydf$Race <- race_cat
# Let's see what it looks like
rydf
## 2008 2009 2010 2011 2012 2013 2014 2015 Race
## 1 11 18 13 16 15 12 29 29 Native Am
## 2 384 583 603 568 770 673 369 488 Asian
## 3 363 320 209 204 258 312 185 169 Black
## 4 33 38 21 13 28 22 17 23 Pacific Is
## 5 5552 7739 6301 5746 8192 7535 4110 4831 White
## 6 43 85 65 58 89 78 46 64 Two or More
## 7 577 563 384 378 574 613 439 512 Hispanic
## 9 1505 1755 1240 1013 1009 971 519 618 Not Avail
# Proportion of female borrowers (borrower_gender == 2) among the given rows,
# reported separately for urban (msa == 1) and rural (msa == 0) areas.
# Returns a length-2 numeric vector: c(urban proportion, rural proportion).
female_residence_prop <- function(x, rows) {
  subset_rows <- x[rows, ]
  is_female <- subset_rows[, "borrower_gender"] == 2
  # Helper: proportion of female borrowers within one residence type
  prop_for_msa <- function(msa_value) {
    in_region <- subset_rows[, "msa"] == msa_value
    sum(is_female & in_region) / sum(in_region)
  }
  # Urban first, then rural, matching the original output order
  c(prop_for_msa(1), prop_for_msa(0))
}
# Find the proportion of female borrowers in 2015
female_residence_prop(mort, mort[, "year"] == 2015)
## [1] 0.2737439 0.2304965
# Split the row numbers of the mortage data by year
spl <- split(1:nrow(mort), mort[, "year"])
# Call str on spl
str(spl)
## List of 8
## $ 2008: int [1:8468] 2 8 15 17 18 28 35 40 42 47 ...
## $ 2009: int [1:11101] 4 13 25 31 43 49 52 56 67 68 ...
## $ 2010: int [1:8836] 1 6 7 10 21 23 24 27 29 38 ...
## $ 2011: int [1:7996] 11 20 37 46 53 57 73 83 86 87 ...
## $ 2012: int [1:10935] 14 16 26 30 32 33 48 69 81 94 ...
## $ 2013: int [1:10216] 5 9 19 22 36 44 55 58 72 74 ...
## $ 2014: int [1:5714] 3 12 50 60 64 66 103 114 122 130 ...
## $ 2015: int [1:6734] 34 41 54 61 62 65 82 91 102 135 ...
# For each of the row splits, find the female residence proportion
all_years <- Map(function(rows) female_residence_prop(mort, rows), spl)
# Call str on all_years
str(all_years)
## List of 8
## $ 2008: num [1:2] 0.275 0.204
## $ 2009: num [1:2] 0.244 0.2
## $ 2010: num [1:2] 0.241 0.201
## $ 2011: num [1:2] 0.252 0.241
## $ 2012: num [1:2] 0.244 0.21
## $ 2013: num [1:2] 0.275 0.257
## $ 2014: num [1:2] 0.289 0.268
## $ 2015: num [1:2] 0.274 0.23
# Collect the per-year results as rows in a matrix
prop_female <- Reduce(rbind, all_years)
# Set the row names (years) and column names (urban / rural proportions).
# NOTE(review): "prop_femal_rural" is a typo for "prop_female_rural"; it is
# kept as-is because the recorded output below reproduces the same spelling.
dimnames(prop_female) <- list(names(all_years), c("prop_female_urban", "prop_femal_rural"))
# View the matrix
prop_female
## prop_female_urban prop_femal_rural
## 2008 0.2748514 0.2039474
## 2009 0.2441074 0.2002978
## 2010 0.2413881 0.2014028
## 2011 0.2520644 0.2408931
## 2012 0.2438950 0.2101313
## 2013 0.2751059 0.2567164
## 2014 0.2886756 0.2678571
## 2015 0.2737439 0.2304965
# Convert prop_female to a data frame
prop_female_df <- as.data.frame(prop_female)
# Add a new column Year
prop_female_df$Year <- row.names(prop_female_df)
# Call gather on prop_female_df
prop_female_long <- gather(prop_female_df, Region, Prop, -Year)
# Create a line plot
ggplot(prop_female_long, aes(x = Year, y = Prop, group = Region, color = Region)) +
geom_line()
# Call summary on mort
summary(mort)
## min max mean NAs
## enterprise 1.0000000 2.0000000 1.3814571 0.0000000
## record_number 0.0000000 999.0000000 499.9080571 0.0000000
## msa 0.0000000 1.0000000 0.8943571 0.0000000
## perc_minority 1.0000000 9.0000000 1.9701857 0.0000000
## tract_income_ratio 1.0000000 9.0000000 2.3431571 0.0000000
## borrower_income_ratio 1.0000000 9.0000000 2.6898857 0.0000000
## loan_purpose 1.0000000 9.0000000 3.7670143 0.0000000
## federal_guarantee 1.0000000 4.0000000 3.9840857 0.0000000
## borrower_race 1.0000000 9.0000000 5.3572429 0.0000000
## co_borrower_race 1.0000000 9.0000000 7.0002714 0.0000000
## borrower_gender 1.0000000 9.0000000 1.4590714 0.0000000
## co_borrower_gender 1.0000000 9.0000000 3.0494857 0.0000000
## num_units 1.0000000 4.0000000 1.0398143 0.0000000
## affordability 0.0000000 9.0000000 4.2863429 0.0000000
## year 2008.0000000 2015.0000000 2011.2714714 0.0000000
## type 0.0000000 1.0000000 0.5300429 0.0000000
bir_df_wide <- bigtable(mort, c("borrower_income_ratio", "year")) %>%
as.data.frame() %>%
tibble::rownames_to_column() %>%
filter(rowname %in% c(1, 2, 3)) %>%
select(-rowname) %>%
# Create a new column called BIR with the corresponding table categories
mutate(BIR = c(">=0,<=50%", ">50, <=80%", ">80%"))
bir_df_wide
## 2008 2009 2010 2011 2012 2013 2014 2015 BIR
## 1 1205 1473 600 620 745 725 401 380 >=0,<=50%
## 2 2095 2791 1554 1421 1819 1861 1032 1145 >50, <=80%
## 3 4844 6707 6609 5934 8338 7559 4255 5169 >80%
bir_df_wide %>%
# Transform the wide-formatted data.frame into the long format
gather(Year, Count, -BIR) %>%
# Use ggplot to create a line plot
ggplot(aes(x = Year, y = Count, group = BIR, color = BIR)) +
geom_line()
Chapter 3 - Working with iotools
Introduction to chunk-wise processing - solution to challenges from bigmemory:
First look at iotools: Importing data:
Using chunk.apply - effectively moves away from what is functionally a “for loop” to allow better parallel processing:
Example code includes:
# Compute a min/max pair that can be "folded": works both on a raw vector
# and on a list of partial range results (as produced by Map over splits).
foldable_range <- function(x) {
  # Plain vector: just take its range
  if (!is.list(x)) {
    return(range(x))
  }
  # List of partial results: reduce to the overall min and max
  c(Reduce(min, x), Reduce(max, x))
}
# Verify that foldable_range() works on the record_number column
foldable_range(mort[, "record_number"])
## [1] 0 999
# Split the mortgage data by year
spl <- split(1:nrow(mort), mort[, "year"])
# Use foldable_range() to get the range of the record numbers
foldable_range(Map(function(s) foldable_range(mort[s, "record_number"]), spl))
## [1] 0 999
# Load the iotools and microbenchmark packages
library(iotools)
library(microbenchmark)
# Time the reading of files
microbenchmark(
# Time the reading of a file using read.delim five times
read.delim("./RInputFiles/mortgage-sample.csv", header = FALSE, sep = ","),
# Time the reading of a file using read.delim.raw five times
read.delim.raw("./RInputFiles/mortgage-sample.csv", header = FALSE, sep = ","),
times = 5
)
## Unit: milliseconds
## expr
## read.delim("./RInputFiles/mortgage-sample.csv", header = FALSE, sep = ",")
## read.delim.raw("./RInputFiles/mortgage-sample.csv", header = FALSE, sep = ",")
## min lq mean median uq max neval cld
## 483.29446 505.05836 599.8070 586.8768 659.8168 763.9885 5 b
## 86.29924 88.14353 181.2442 100.0256 133.5649 498.1877 5 a
# Read mortgage-sample.csv as a raw vector
raw_file_content <- readAsRaw("./RInputFiles/mortgage-sample.csv")
# Convert the raw vector contents to a matrix
mort_mat <- mstrsplit(raw_file_content, sep = ",", type = "integer", skip = 1)
# Look at the first 6 rows
head(mort_mat)
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13]
## [1,] 1 566 1 1 3 1 2 4 3 9 2 4 1
## [2,] 1 116 1 3 2 1 2 4 5 9 1 4 1
## [3,] 1 239 1 2 2 3 8 4 5 5 1 2 1
## [4,] 1 62 1 2 3 3 2 4 5 9 2 4 1
## [5,] 1 106 1 2 3 3 2 4 9 9 3 4 1
## [6,] 1 759 1 3 3 2 2 4 9 9 1 2 2
## [,14] [,15] [,16]
## [1,] 3 2010 1
## [2,] 3 2008 1
## [3,] 4 2014 0
## [4,] 4 2009 1
## [5,] 4 2013 1
## [6,] 4 2010 1
# Convert the raw file contents to a data.frame
mort_df <- dstrsplit(raw_file_content, sep = ",", col_types = rep("integer", 16), skip = 1)
# Look at the first 6 rows
head(mort_df)
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16
## 1 1 566 1 1 3 1 2 4 3 9 2 4 1 3 2010 1
## 2 1 116 1 3 2 1 2 4 5 9 1 4 1 3 2008 1
## 3 1 239 1 2 2 3 8 4 5 5 1 2 1 4 2014 0
## 4 1 62 1 2 3 3 2 4 5 9 2 4 1 4 2009 1
## 5 1 106 1 2 3 3 2 4 9 9 3 4 1 4 2013 1
## 6 1 759 1 3 3 2 2 4 9 9 1 2 2 4 2010 1
# We have created a file connection fc to the "mortgage-sample.csv" file and read in the first line to get rid of the header.
# Define the function to apply to each chunk of raw bytes from the file
make_table <- function(chunk) {
# Parse the raw comma-separated chunk into an integer matrix (iotools)
x <- mstrsplit(chunk, type = "integer", sep = ",")
# Tabulate column 3 -- "msa" per the file header (0 = rural, 1 = urban)
table(x[, 3])
}
# Create a file connection to mortgage-sample.csv (binary mode for chunked reads)
fc <- file("./RInputFiles/mortgage-sample.csv", "rb")
# Read the first line to get rid of the header (and capture it for column names)
(col_names <- readLines(fc, n = 1))
## [1] "\"enterprise\",\"record_number\",\"msa\",\"perc_minority\",\"tract_income_ratio\",\"borrower_income_ratio\",\"loan_purpose\",\"federal_guarantee\",\"borrower_race\",\"co_borrower_race\",\"borrower_gender\",\"co_borrower_gender\",\"num_units\",\"affordability\",\"year\",\"type\""
# Split the quoted header line into bare column names.
# NOTE(review): str_split()/str_replace() come from stringr, which is not
# loaded in this chunk -- presumably attached earlier in the session; verify.
(col_names <- lapply(str_split(col_names, '\\",\\"'), FUN=function(x) { str_replace(x, '\\"', '') })[[1]])
## [1] "enterprise" "record_number"
## [3] "msa" "perc_minority"
## [5] "tract_income_ratio" "borrower_income_ratio"
## [7] "loan_purpose" "federal_guarantee"
## [9] "borrower_race" "co_borrower_race"
## [11] "borrower_gender" "co_borrower_gender"
## [13] "num_units" "affordability"
## [15] "year" "type"
# Read the data in chunks
counts <- chunk.apply(fc, make_table, CH.MAX.SIZE = 1e5)
# Close the file connection
close(fc)
# Print counts
counts
## 0 1
## [1,] 309 2401
## [2,] 289 2422
## [3,] 266 2444
## [4,] 300 2410
## [5,] 279 2431
## [6,] 310 2400
## [7,] 274 2436
## [8,] 283 2428
## [9,] 259 2452
## [10,] 287 2423
## [11,] 288 2423
## [12,] 283 2428
## [13,] 271 2439
## [14,] 299 2411
## [15,] 294 2416
## [16,] 305 2405
## [17,] 280 2431
## [18,] 275 2435
## [19,] 303 2407
## [20,] 279 2431
## [21,] 296 2414
## [22,] 294 2417
## [23,] 288 2424
## [24,] 264 2446
## [25,] 292 2418
## [26,] 228 2013
# Sum up the chunks
colSums(counts)
## 0 1
## 7395 62605
msa_map <- c("rural", "urban")
# Define the function to apply to each chunk of raw bytes from the file
make_msa_table <- function(chunk) {
# Parse the raw chunk into a data frame of integer columns (iotools);
# relies on the file-level `col_names` vector for the column count
x <- dstrsplit(chunk, col_types = rep("integer", length(col_names)), sep = ",")
# Set the column names of the data frame that's been read
colnames(x) <- col_names
# Recode msa (0/1) to "rural"/"urban" via the file-level `msa_map` lookup
x$msa_pretty <- msa_map[x$msa + 1]
# Count rural vs urban borrowers in this chunk
table(x$msa_pretty)
}
# Create a file connection to mortgage-sample.csv
fc <- file("./RInputFiles/mortgage-sample.csv", "rb")
# Read the first line to get rid of the header
readLines(fc, n = 1)
## [1] "\"enterprise\",\"record_number\",\"msa\",\"perc_minority\",\"tract_income_ratio\",\"borrower_income_ratio\",\"loan_purpose\",\"federal_guarantee\",\"borrower_race\",\"co_borrower_race\",\"borrower_gender\",\"co_borrower_gender\",\"num_units\",\"affordability\",\"year\",\"type\""
# Read the data in chunks
counts <- chunk.apply(fc, make_msa_table, CH.MAX.SIZE = 1e5)
# Close the file connection
close(fc)
# Aggregate the counts as before
colSums(counts)
## rural urban
## 7395 62605
# Read mortgage-sample.csv chunk-wise with chunk.apply(), using `parallel`
# worker processes, and return the combined per-chunk msa tables.
# Fixes two defects in the original:
#   1. the chunk.apply() result was discarded (the function returned the
#      invisible NULL from close(fc));
#   2. the connection leaked if chunk.apply() errored -- now closed via on.exit().
iotools_read_fun <- function(parallel) {
  fc <- file("./RInputFiles/mortgage-sample.csv", "rb")
  # Guarantee the connection is closed even if chunk.apply() errors
  on.exit(close(fc), add = TRUE)
  # Discard the header line so chunks contain only data rows
  readLines(fc, n = 1)
  # Process the file chunk-wise, optionally in parallel
  chunk.apply(fc, make_msa_table,
              CH.MAX.SIZE = 1e5, parallel = parallel)
}
# Benchmark the new function
microbenchmark(
# Use one process
iotools_read_fun(1),
# Use three processes
iotools_read_fun(3),
times = 20
)
## Unit: milliseconds
## expr min lq mean median uq max
## iotools_read_fun(1) 102.8291 116.6638 147.6507 124.0177 193.0392 231.5074
## iotools_read_fun(3) 111.2729 126.3639 193.1596 186.3738 223.7989 378.9957
## neval cld
## 20 a
## 20 b
Chapter 4 - Case Study: Preliminary Analysis of Housing Data
Overview of types of analysis for this chapter:
Are the data missing at random?
Analyzing the Housing Data:
Borrower Lending Trends: City vs. Rural:
Wrap up:
Example code includes:
# Create a table of borrower_race column
race_table <- bigtable(mort, "borrower_race")
# Rename the elements
names(race_table) <- race_cat[as.numeric(names(race_table))]
# Find the proportion
race_table[1:7] / sum(race_table[1:7])
## Native Am Asian Black Pacific Is White Two or More
## 0.002330129 0.072315464 0.032915105 0.003177448 0.814828092 0.008603552
## Hispanic
## 0.065830210
mort_names <- col_names
# Create table of the borrower_race
race_table_chunks <- chunk.apply(
"./RInputFiles/mortgage-sample.csv", function(chunk) {
x <- mstrsplit(chunk, sep = ",", type = "integer")
colnames(x) <- mort_names
table(x[, "borrower_race"])
}, CH.MAX.SIZE = 1e5)
# Add up the columns
race_table <- colSums(race_table_chunks)
# Find the proportion
borrower_proportion <- race_table[1:7] / sum(race_table[1:7])
pop_proportion <- c(0.009, 0.048, 0.126, 0.002, 0.724, 0.029, 0.163)
names(pop_proportion) <- race_cat[1:7]
# Create the matrix
matrix(c(pop_proportion, borrower_proportion), byrow = TRUE, nrow = 2,
dimnames = list(c("Population Proportion", "Borrower Proportion"), race_cat[1:7]))
## Native Am Asian Black Pacific Is
## Population Proportion 0.009000000 0.04800000 0.12600000 0.002000000
## Borrower Proportion 0.002330129 0.07231546 0.03291511 0.003177448
## White Two or More Hispanic
## Population Proportion 0.7240000 0.029000000 0.16300000
## Borrower Proportion 0.8148281 0.008603552 0.06583021
# Create a variable indicating if borrower_race is missing in the mortgage data
borrower_race_ind <- mort[, "borrower_race"] == 9
# Create a factor variable indicating the affordability
affordability_factor <- factor(mort[, "affordability"])
# Perform a logistic regression
summary(glm(borrower_race_ind ~ affordability_factor, family = binomial))
## Warning: closing unused connection 5 (./RInputFiles/mortgage-sample.csv)
##
## Call:
## glm(formula = borrower_race_ind ~ affordability_factor, family = binomial)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.5969 -0.5016 -0.5016 -0.5016 2.0867
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.7478 0.1376 -12.701 <2e-16 ***
## affordability_factor1 -0.2241 0.1536 -1.459 0.1447
## affordability_factor2 -0.3090 0.1609 -1.920 0.0548 .
## affordability_factor3 -0.2094 0.1446 -1.448 0.1476
## affordability_factor4 -0.2619 0.1383 -1.894 0.0582 .
## affordability_factor9 0.1131 0.1413 0.800 0.4235
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 52279 on 69999 degrees of freedom
## Residual deviance: 52166 on 69994 degrees of freedom
## AIC: 52178
##
## Number of Fisher Scoring iterations: 4
# Open a connection to the file and skip the header
fc <- file("./RInputFiles/mortgage-sample.csv", "rb")
readLines(fc, n = 1)
## [1] "\"enterprise\",\"record_number\",\"msa\",\"perc_minority\",\"tract_income_ratio\",\"borrower_income_ratio\",\"loan_purpose\",\"federal_guarantee\",\"borrower_race\",\"co_borrower_race\",\"borrower_gender\",\"co_borrower_gender\",\"num_units\",\"affordability\",\"year\",\"type\""
# Create a function to process each raw chunk read from the file
make_table <- function(chunk) {
# Parse the raw comma-separated chunk into an integer matrix (iotools)
m <- mstrsplit(chunk, sep = ",", type = "integer")
# Attach column names from the file-level `mort_names` vector
colnames(m) <- mort_names
# Cross-tabulate borrower race by year for this chunk (bigtabulate)
bigtable(m, c("borrower_race", "year"))
}
# Import data using chunk.apply
race_year_table <- chunk.apply(fc, make_table)
# Close connection
close(fc)
# Cast it to a data frame
rydf <- as.data.frame(race_year_table)
# Create a new column Race with race/ethnicity
rydf$Race <- race_cat
# Note: We removed the row corresponding to "Not Avail".
# View rydf
rydf <-
rydf %>%
filter(Race !="Not Avail")
rydf
## 2008 2009 2010 2011 2012 2013 2014 2015 Race
## 1 11 18 13 16 15 12 29 29 Native Am
## 2 384 583 603 568 770 673 369 488 Asian
## 3 363 320 209 204 258 312 185 169 Black
## 4 33 38 21 13 28 22 17 23 Pacific Is
## 5 5552 7739 6301 5746 8192 7535 4110 4831 White
## 6 43 85 65 58 89 78 46 64 Two or More
## 7 577 563 384 378 574 613 439 512 Hispanic
# View pop_proportion
pop_proportion
## Native Am Asian Black Pacific Is White Two or More
## 0.009 0.048 0.126 0.002 0.724 0.029
## Hispanic
## 0.163
# Gather on all variables except Race
rydfl <- gather(rydf, Year, Count, -Race)
# Create a new adjusted count variable
rydfl$Adjusted_Count <- rydfl$Count / pop_proportion[rydfl$Race]
# Plot
ggplot(rydfl, aes(x = Year, y = Adjusted_Count, group = Race, color = Race)) +
geom_line()
# View rydf
rydf
## 2008 2009 2010 2011 2012 2013 2014 2015 Race
## 1 11 18 13 16 15 12 29 29 Native Am
## 2 384 583 603 568 770 673 369 488 Asian
## 3 363 320 209 204 258 312 185 169 Black
## 4 33 38 21 13 28 22 17 23 Pacific Is
## 5 5552 7739 6301 5746 8192 7535 4110 4831 White
## 6 43 85 65 58 89 78 46 64 Two or More
## 7 577 563 384 378 574 613 439 512 Hispanic
# Normalize the columns
for (i in seq_len(nrow(rydf))) {
rydf[i, 1:8] <- rydf[i, 1:8] / rydf[i, 1]
}
# Convert the data to long format
rydf_long <- gather(rydf, Year, Proportion, -Race)
# Plot
ggplot(rydf_long, aes(x = Year, y = Proportion, group = Race, color = Race)) +
geom_line()
# Open a connection to the file and skip the header
fc <- file("./RInputFiles/mortgage-sample.csv", "rb")
readLines(fc, n = 1)
## [1] "\"enterprise\",\"record_number\",\"msa\",\"perc_minority\",\"tract_income_ratio\",\"borrower_income_ratio\",\"loan_purpose\",\"federal_guarantee\",\"borrower_race\",\"co_borrower_race\",\"borrower_gender\",\"co_borrower_gender\",\"num_units\",\"affordability\",\"year\",\"type\""
# Create a function to process each raw chunk read from the file
make_table <- function(chunk) {
# Parse the raw comma-separated chunk into an integer matrix (iotools)
m <- mstrsplit(chunk, sep = ",", type = "integer")
# Attach column names from the file-level `mort_names` vector
colnames(m) <- mort_names
# Cross-tabulate residence type (msa) by year for this chunk (bigtabulate)
bigtable(m, c("msa", "year"))
}
# Import data using chunk.apply
msa_year_table <- chunk.apply(fc, make_table)
# Close connection
close(fc)
# Convert the msa-by-year table to a data frame
df_msa <- as.data.frame(msa_year_table)
# Add a human-readable MSA column -- rows appear to be msa = 0 then msa = 1
# (rural then city), matching the earlier msa_map coding; confirm row order.
df_msa$MSA <- c("rural", "city")
# Gather on all columns except Year
df_msa_long <- gather(df_msa, Year, Count, -MSA)
# Plot
ggplot(df_msa_long, aes(x = Year, y = Count, group = MSA, color = MSA)) +
geom_line()
# Tabulate borrower_income_ratio and federal_guarantee
ir_by_fg <- bigtable(mort, c("borrower_income_ratio", "federal_guarantee"))
# Label the columns and rows of the table
income_cat <- c('0 <= 50', '50 < 80', '> 80', 'Not Applicable')
guarantee_cat <- c('FHA/VA', 'RHS', 'HECM', 'No Guarantee')
dimnames(ir_by_fg) <- list(income_cat, guarantee_cat)
# For each row in ir_by_fg, divide by the sum of the row
for (i in seq_len(nrow(ir_by_fg))) {
ir_by_fg[i, ] = ir_by_fg[i, ] / sum(ir_by_fg[i, ])
}
# Print
ir_by_fg
## FHA/VA RHS HECM No Guarantee
## 0 <= 50 0.008944544 0.0014636526 0.0443974630 0.9451943
## 50 < 80 0.005977548 0.0024055985 0.0026971862 0.9889197
## > 80 0.001113022 0.0002428412 0.0006475766 0.9979966
## Not Applicable 0.023676880 0.0013927577 0.0487465181 0.9261838
# Quirky fix so that the files can be used again later
rm(mort)
rm(x)
rm(first_three)
rm(first_three_2)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 1886841 100.8 3205452 171.2 3205452 171.2
## Vcells 12926707 98.7 20761330 158.4 17964950 137.1
Chapter 1 - Downloading Files and Using API Clients
Introduction: Working with Web Data in R:
Understanding Application Programming Interfaces (API) - automatically handling data changes:
Access tokens and API:
Example code includes:
# Here are the URLs! As you can see they're just normal strings
csv_url <- "http://s3.amazonaws.com/assets.datacamp.com/production/course_1561/datasets/chickwts.csv"
tsv_url <- "http://s3.amazonaws.com/assets.datacamp.com/production/course_3026/datasets/tsv_data.tsv"
# Read a file in from the CSV URL and assign it to csv_data
csv_data <- read.csv(csv_url)
# Read a file in from the TSV URL and assign it to tsv_data
tsv_data <- read.delim(tsv_url)
# Examine the objects with head()
head(csv_data)
## weight feed
## 1 179 horsebean
## 2 160 horsebean
## 3 136 horsebean
## 4 227 horsebean
## 5 217 horsebean
## 6 168 horsebean
head(tsv_data)
## weight feed
## 1 179 horsebean
## 2 160 horsebean
## 3 136 horsebean
## 4 227 horsebean
## 5 217 horsebean
## 6 168 horsebean
# Download the file with download.file()
download.file(url = csv_url, destfile = "./RInputFiles/feed_data.csv")
# Read it in with read.csv()
csv_data <- read.csv("./RInputFiles/feed_data.csv")
# Add a new column: square_weight
csv_data$square_weight <- csv_data$weight ** 2
# Save it to disk with saveRDS()
saveRDS(csv_data, "./RInputFiles/modified_feed_data.RDS")
# Read it back in with readRDS()
modified_feed_data <- readRDS("./RInputFiles/modified_feed_data.RDS")
# Examine modified_feed_data
str(modified_feed_data)
## 'data.frame': 71 obs. of 3 variables:
## $ weight : int 179 160 136 227 217 168 108 124 143 140 ...
## $ feed : Factor w/ 6 levels "casein","horsebean",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ square_weight: num 32041 25600 18496 51529 47089 ...
# Load pageviews
# library(pageviews)
# Get the pageviews for "Hadley Wickham"
hadley_pageviews <- pageviews::article_pageviews(project = "en.wikipedia", "Hadley Wickham")
# Examine the resulting object
str(hadley_pageviews)
## 'data.frame': 1 obs. of 8 variables:
## $ project : chr "wikipedia"
## $ language : chr "en"
## $ article : chr "Hadley_Wickham"
## $ access : chr "all-access"
## $ agent : chr "all-agents"
## $ granularity: chr "daily"
## $ date : POSIXct, format: "2015-10-01"
## $ views : num 53
# Load birdnik
# library(birdnik)
# Get the word frequency for "vector", using api_key to access it
# vector_frequency <- word_frequency(api_key, "vector")
Chapter 2 - Using httr to interact with API Directly
GET and POST requests in theory - https and web requests in theory:
Graceful httr - code that responds appropriately and constructs its own url:
Respectful API Usage - usage that works for the API owners as well as the clients:
Example code includes:
# Load the httr package
library(httr)
# Make a GET request to http://httpbin.org/get
get_result <- GET("http://httpbin.org/get")
# Print it to inspect it
# get_result
# Make a POST request to http://httpbin.org/post with the body "this is a test"
# post_result <- POST(url="http://httpbin.org/post", body="this is a test")
# Print it to inspect it
# post_result
url <- "https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia.org/all-access/all-agents/Hadley_Wickham/daily/20170101/20170102"
# Make a GET request to url and save the results
pageview_response <- GET(url)
# Call content() to retrieve the data the server sent back
pageview_data <- content(pageview_response)
# Examine the results with str()
str(pageview_data)
## List of 1
## $ items:List of 2
## ..$ :List of 7
## .. ..$ project : chr "en.wikipedia"
## .. ..$ article : chr "Hadley_Wickham"
## .. ..$ granularity: chr "daily"
## .. ..$ timestamp : chr "2017010100"
## .. ..$ access : chr "all-access"
## .. ..$ agent : chr "all-agents"
## .. ..$ views : int 45
## ..$ :List of 7
## .. ..$ project : chr "en.wikipedia"
## .. ..$ article : chr "Hadley_Wickham"
## .. ..$ granularity: chr "daily"
## .. ..$ timestamp : chr "2017010200"
## .. ..$ access : chr "all-access"
## .. ..$ agent : chr "all-agents"
## .. ..$ views : int 86
fake_url <- "http://google.com/fakepagethatdoesnotexist"
# Make the GET request
request_result <- GET(fake_url)
# Check request_result
if(http_error(request_result)){
warning("The request failed")
} else {
content(request_result)
}
## Warning: The request failed
# Construct a directory-based API URL to `http://swapi.co/api`,
# looking for person `1` in `people`
directory_url <- paste("http://swapi.co/api", "people", 1, sep = "/")
# Make a GET call with it
result <- GET(directory_url)
# Create list with nationality and country elements
query_params <- list(nationality = "americans",
country = "antigua")
# Make parameter-based call to httpbin, with query_params
parameter_response <- GET("https://httpbin.org/get", query = query_params)
# Print parameter_response
parameter_response
## Response [https://httpbin.org/get?nationality=americans&country=antigua]
## Date: 2018-02-16 13:02
## Status: 200
## Content-Type: application/json
## Size: 425 B
## {
## "args": {
## "country": "antigua",
## "nationality": "americans"
## },
## "headers": {
## "Accept": "application/json, text/xml, application/xml, */*",
## "Accept-Encoding": "gzip, deflate",
## "Connection": "close",
## "Host": "httpbin.org",
## ...
# Do not change the url
# url <- "https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia/all-access/all-agents/Aaron_Halfaker/daily/2015100100/2015103100"
# Add the email address and the test sentence inside user_agent()
# server_response <- GET(url, user_agent("my@email.address this is a test"))
# Construct a vector of 2 URLs
urls <- c("http://fakeurl.com/api/1.0/", "http://fakeurl.com/api/2.0/")
# Rate-limit the requests: pause between iterations so the server
# is not hammered
for(url in urls){
# Send a GET request to url
result <- GET(url)
# Delay for 1 second between requests (Sys.sleep() takes seconds)
Sys.sleep(1)
}
get_pageviews <- function(article_title){
  # Fetch daily pageview counts for a Wikipedia article (Oct 2015 window)
  # from the Wikimedia REST API and return the parsed response content.
  #
  # Bug fix: the original used paste0(..., sep = "/"). paste0() has no
  # `sep` argument, so "/" was pasted onto the END of the string and the
  # path components were concatenated with no separators at all,
  # producing a malformed URL. paste(..., sep = "/") builds it correctly.
  url <- paste(
    "https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia/all-access/all-agents",
    article_title,
    "daily/2015100100/2015103100",
    sep = "/"
  )
  # Identify ourselves to the API owners, per polite-usage guidelines
  response <- GET(url, user_agent("my@email.com this is a test"))
  # Fail loudly on an HTTP error status; otherwise return parsed content
  if(http_error(response)){
    stop("the request failed")
  } else {
    result <- content(response)
    return(result)
  }
}
Chapter 3 - Handling JSON and XML
JSON is a dictionary-like format (plain text) for sending data on the internet:
Manipulating JSON - lists are the natural R hierarchy for JSON:
XML Structure - plain text like JSON, but with a very different structure:
XPATH - language for specifying nodes in an XML document:
Example code includes:
rev_history <- function(title, format = "json"){
  # Stub that replays a cached Wikipedia revision-history response from
  # disk. Only the "Hadley Wickham" article is cached, in two formats.
  if (title != "Hadley Wickham") {
    stop('rev_history() only works for `title = "Hadley Wickham"`')
  }
  # Dispatch on the requested serialization; the unnamed final branch is
  # the default, hit for any unrecognized format string.
  switch(format,
    json = readRDS("had_rev_json.rds"),
    xml  = readRDS("had_rev_xml.rds"),
    stop('Invalid format supplied, try "json" or "xml"')
  )
}
test_json <- "{\"continue\":{\"rvcontinue\":\"20150528042700|664370232\",\"continue\":\"||\"},\"query\":{\"pages\":{\"41916270\":{\"pageid\":41916270,\"ns\":0,\"title\":\"Hadley Wickham\",\"revisions\":[{\"user\":\"214.28.226.251\",\"anon\":\"\",\"timestamp\":\"2015-01-14T17:12:45Z\",\"comment\":\"\",\"contentformat\":\"text/x-wiki\",\"contentmodel\":\"wikitext\",\"*\":\"'''Hadley Mary Helen Wickham III''' is a [[statistician]] from [[New Zealand]] who is currently Chief Scientist at [[RStudio]]<ref>{{cite web|url=http://washstat.org/wss1310.shtml |title=Washington Statistical Society October 2013 Newsletter |publisher=Washstat.org |date= |accessdate=2014-02-12}}</ref><ref>{{cite web|url=http://news.idg.no/cw/art.cfm?id=F66B12BB-D13E-94B0-DAA22F5AB01BEFE7 |title=60+ R resources to improve your data skills ( - Software ) |publisher=News.idg.no |date= |accessdate=2014-02-12}}</ref> and an [[Professors_in_the_United_States#Adjunct_professor|adjunct]] [[Assistant Professor]] of statistics at [[Rice University]].<ref name=\\\"about\\\">{{cite web|url=http://www.rstudio.com/about/ |title=About - RStudio |accessdate=2014-08-13}}</ref> He is best known for his development of open-source statistical analysis software packages for [[R (programming language)]] that implement logics of [[data visualisation]] and data transformation. 
Wickham completed his undergraduate studies at the [[University of Auckland]] and his PhD at [[Iowa State University]] under the supervision of Di Cook and Heike Hoffman.<ref>{{cite web|URL=http://blog.revolutionanalytics.com/2010/09/the-r-files-hadley-wickham.html |title= The R-Files: Hadley Wickham}}</ref> In 2006 he was awarded the [[John_Chambers_(statistician)|John Chambers]] Award for Statistical Computing for his work developing tools for data reshaping and visualisation.<ref>{{cite web|url=http://stat-computing.org/awards/jmc/winners.html |title=John Chambers Award Past winners|publisher=ASA Sections on Statistical Computing, Statistical Graphics,|date= |accessdate=2014-08-12}}</ref>\\n\\nHe is a prominent and active member of the [[R (programming language)|R]] user community and has developed several notable and widely used packages including [[ggplot2]], plyr, dplyr, and reshape2.<ref name=\\\"about\\\" /><ref>{{cite web|url=http://www.r-statistics.com/2013/06/top-100-r-packages-for-2013-jan-may/ |title=Top 100 R Packages for 2013 (Jan-May)! 
|publisher=R-statistics blog |date= |accessdate=2014-08-12}}</ref>\"},{\"user\":\"73.183.151.193\",\"anon\":\"\",\"timestamp\":\"2015-01-15T15:49:34Z\",\"comment\":\"\",\"contentformat\":\"text/x-wiki\",\"contentmodel\":\"wikitext\",\"*\":\"'''Hadley Wickham''' is a [[statistician]] from [[New Zealand]] who is currently Chief Scientist at [[RStudio]]<ref>{{cite web|url=http://washstat.org/wss1310.shtml |title=Washington Statistical Society October 2013 Newsletter |publisher=Washstat.org |date= |accessdate=2014-02-12}}</ref><ref>{{cite web|url=http://news.idg.no/cw/art.cfm?id=F66B12BB-D13E-94B0-DAA22F5AB01BEFE7 |title=60+ R resources to improve your data skills ( - Software ) |publisher=News.idg.no |date= |accessdate=2014-02-12}}</ref> and an [[Professors_in_the_United_States#Adjunct_professor|adjunct]] [[Assistant Professor]] of statistics at [[Rice University]].<ref name=\\\"about\\\">{{cite web|url=http://www.rstudio.com/about/ |title=About - RStudio |accessdate=2014-08-13}}</ref> He is best known for his development of open-source statistical analysis software packages for [[R (programming language)]] that implement logics of [[data visualisation]] and data transformation. 
Wickham completed his undergraduate studies at the [[University of Auckland]] and his PhD at [[Iowa State University]] under the supervision of Di Cook and Heike Hoffman.<ref>{{cite web|URL=http://blog.revolutionanalytics.com/2010/09/the-r-files-hadley-wickham.html |title= The R-Files: Hadley Wickham}}</ref> In 2006 he was awarded the [[John_Chambers_(statistician)|John Chambers]] Award for Statistical Computing for his work developing tools for data reshaping and visualisation.<ref>{{cite web|url=http://stat-computing.org/awards/jmc/winners.html |title=John Chambers Award Past winners|publisher=ASA Sections on Statistical Computing, Statistical Graphics,|date= |accessdate=2014-08-12}}</ref>\\n\\nHe is a prominent and active member of the [[R (programming language)|R]] user community and has developed several notable and widely used packages including [[ggplot2]], plyr, dplyr, and reshape2.<ref name=\\\"about\\\" /><ref>{{cite web|url=http://www.r-statistics.com/2013/06/top-100-r-packages-for-2013-jan-may/ |title=Top 100 R Packages for 2013 (Jan-May)! 
|publisher=R-statistics blog |date= |accessdate=2014-08-12}}</ref>\"},{\"user\":\"FeanorStar7\",\"timestamp\":\"2015-01-24T16:34:31Z\",\"comment\":\"/* External links */ add LCCN and cats\",\"contentformat\":\"text/x-wiki\",\"contentmodel\":\"wikitext\",\"*\":\"'''Hadley Wickham''' is a [[statistician]] from [[New Zealand]] who is currently Chief Scientist at [[RStudio]]<ref>{{cite web|url=http://washstat.org/wss1310.shtml |title=Washington Statistical Society October 2013 Newsletter |publisher=Washstat.org |date= |accessdate=2014-02-12}}</ref><ref>{{cite web|url=http://news.idg.no/cw/art.cfm?id=F66B12BB-D13E-94B0-DAA22F5AB01BEFE7 |title=60+ R resources to improve your data skills ( - Software ) |publisher=News.idg.no |date= |accessdate=2014-02-12}}</ref> and an [[Professors_in_the_United_States#Adjunct_professor|adjunct]] [[Assistant Professor]] of statistics at [[Rice University]].<ref name=\\\"about\\\">{{cite web|url=http://www.rstudio.com/about/ |title=About - RStudio |accessdate=2014-08-13}}</ref> He is best known for his development of open-source statistical analysis software packages for [[R (programming language)]] that implement logics of [[data visualisation]] and data transformation. 
Wickham completed his undergraduate studies at the [[University of Auckland]] and his PhD at [[Iowa State University]] under the supervision of Di Cook and Heike Hoffman.<ref>{{cite web|URL=http://blog.revolutionanalytics.com/2010/09/the-r-files-hadley-wickham.html |title= The R-Files: Hadley Wickham}}</ref> In 2006 he was awarded the [[John_Chambers_(statistician)|John Chambers]] Award for Statistical Computing for his work developing tools for data reshaping and visualisation.<ref>{{cite web|url=http://stat-computing.org/awards/jmc/winners.html |title=John Chambers Award Past winners|publisher=ASA Sections on Statistical Computing, Statistical Graphics,|date= |accessdate=2014-08-12}}</ref>\\n\\nHe is a prominent and active member of the [[R (programming language)|R]] user community and has developed several notable and widely used packages including [[ggplot2]], plyr, dplyr, and reshape2.<ref name=\\\"about\\\" /><ref>{{cite web|url=http://www.r-statistics.com/2013/06/top-100-r-packages-for-2013-jan-may/ |title=Top 100 R Packages for 2013 (Jan-May)! 
|publisher=R-statistics blog |date= |accessdate=2014-08-12}}</ref>\"},{\"user\":\"KasparBot\",\"timestamp\":\"2015-04-26T19:18:17Z\",\"comment\":\"authority control moved to wikidata\",\"contentformat\":\"text/x-wiki\",\"contentmodel\":\"wikitext\",\"*\":\"'''Hadley Wickham''' is a [[statistician]] from [[New Zealand]] who is currently Chief Scientist at [[RStudio]]<ref>{{cite web|url=http://washstat.org/wss1310.shtml |title=Washington Statistical Society October 2013 Newsletter |publisher=Washstat.org |date= |accessdate=2014-02-12}}</ref><ref>{{cite web|url=http://news.idg.no/cw/art.cfm?id=F66B12BB-D13E-94B0-DAA22F5AB01BEFE7 |title=60+ R resources to improve your data skills ( - Software ) |publisher=News.idg.no |date= |accessdate=2014-02-12}}</ref> and an [[Professors_in_the_United_States#Adjunct_professor|adjunct]] [[Assistant Professor]] of statistics at [[Rice University]].<ref name=\\\"about\\\">{{cite web|url=http://www.rstudio.com/about/ |title=About - RStudio |accessdate=2014-08-13}}</ref> He is best known for his development of open-source statistical analysis software packages for [[R (programming language)]] that implement logics of [[data visualisation]] and data transformation. 
Wickham completed his undergraduate studies at the [[University of Auckland]] and his PhD at [[Iowa State University]] under the supervision of Di Cook and Heike Hoffman.<ref>{{cite web|URL=http://blog.revolutionanalytics.com/2010/09/the-r-files-hadley-wickham.html |title= The R-Files: Hadley Wickham}}</ref> In 2006 he was awarded the [[John_Chambers_(statistician)|John Chambers]] Award for Statistical Computing for his work developing tools for data reshaping and visualisation.<ref>{{cite web|url=http://stat-computing.org/awards/jmc/winners.html |title=John Chambers Award Past winners|publisher=ASA Sections on Statistical Computing, Statistical Graphics,|date= |accessdate=2014-08-12}}</ref>\\n\\nHe is a prominent and active member of the [[R (programming language)|R]] user community and has developed several notable and widely used packages including [[ggplot2]], plyr, dplyr, and reshape2.<ref name=\\\"about\\\" /><ref>{{cite web|url=http://www.r-statistics.com/2013/06/top-100-r-packages-for-2013-jan-may/ |title=Top 100 R Packages for 2013 (Jan-May)! 
|publisher=R-statistics blog |date= |accessdate=2014-08-12}}</ref>\"},{\"user\":\"Spkal\",\"timestamp\":\"2015-05-06T18:24:57Z\",\"comment\":\"/* Bibliography */ Added his new book, R Packages\",\"contentformat\":\"text/x-wiki\",\"contentmodel\":\"wikitext\",\"*\":\"'''Hadley Wickham''' is a [[statistician]] from [[New Zealand]] who is currently Chief Scientist at [[RStudio]]<ref>{{cite web|url=http://washstat.org/wss1310.shtml |title=Washington Statistical Society October 2013 Newsletter |publisher=Washstat.org |date= |accessdate=2014-02-12}}</ref><ref>{{cite web|url=http://news.idg.no/cw/art.cfm?id=F66B12BB-D13E-94B0-DAA22F5AB01BEFE7 |title=60+ R resources to improve your data skills ( - Software ) |publisher=News.idg.no |date= |accessdate=2014-02-12}}</ref> and an [[Professors_in_the_United_States#Adjunct_professor|adjunct]] [[Assistant Professor]] of statistics at [[Rice University]].<ref name=\\\"about\\\">{{cite web|url=http://www.rstudio.com/about/ |title=About - RStudio |accessdate=2014-08-13}}</ref> He is best known for his development of open-source statistical analysis software packages for [[R (programming language)]] that implement logics of [[data visualisation]] and data transformation. 
Wickham completed his undergraduate studies at the [[University of Auckland]] and his PhD at [[Iowa State University]] under the supervision of Di Cook and Heike Hoffman.<ref>{{cite web|URL=http://blog.revolutionanalytics.com/2010/09/the-r-files-hadley-wickham.html |title= The R-Files: Hadley Wickham}}</ref> In 2006 he was awarded the [[John_Chambers_(statistician)|John Chambers]] Award for Statistical Computing for his work developing tools for data reshaping and visualisation.<ref>{{cite web|url=http://stat-computing.org/awards/jmc/winners.html |title=John Chambers Award Past winners|publisher=ASA Sections on Statistical Computing, Statistical Graphics,|date= |accessdate=2014-08-12}}</ref>\\n\\nHe is a prominent and active member of the [[R (programming language)|R]] user community and has developed several notable and widely used packages including [[ggplot2]], plyr, dplyr, and reshape2.<ref name=\\\"about\\\" /><ref>{{cite web|url=http://www.r-statistics.com/2013/06/top-100-r-packages-for-2013-jan-may/ |title=Top 100 R Packages for 2013 (Jan-May)! |publisher=R-statistics blog |date= |accessdate=2014-08-12}}</ref>\"}]}}}}"
# Get revision history for "Hadley Wickham"
resp_json <- rev_history("Hadley Wickham")
# Check http_type() of resp_json
http_type(resp_json)
# Examine returned text with content()
content(resp_json, as="text")
# Parse response with content()
content(resp_json, as="parsed")
# Parse returned text with fromJSON()
library(jsonlite)
fromJSON(content(resp_json, as="text"))
# Load rlist
library(rlist)
# Examine output of this code
str(content(resp_json), max.level = 4)
# Store revision list
revs <- content(resp_json)$query$pages$`41916270`$revisions
# Extract the user element
user_time <- list.select(revs, user, timestamp)
# Print user_time
user_time
# Stack to turn into a data frame
list.stack(user_time)
# Load dplyr
library(dplyr)
# Pull out revision list
revs <- content(resp_json)$query$pages$`41916270`$revisions
# Extract user and timestamp
revs %>%
bind_rows() %>%
select(user, timestamp)
# Load xml2
library(xml2)
# Get XML revision history
resp_xml <- rev_history("Hadley Wickham", format = "xml")
# Check response is XML
http_type(resp_xml)
# Examine returned text with content()
rev_text <- content(resp_xml, as="text")
rev_text
# Turn rev_text into an XML document
rev_xml <- read_xml(rev_text)
# Examine the structure of rev_xml
str(rev_xml)
# Load xml2
library(xml2)
# Get XML revision history
resp_xml <- rev_history("Hadley Wickham", format = "xml")
# Check response is XML
http_type(resp_xml)
# Examine returned text with content()
rev_text <- content(resp_xml, as="text")
rev_text
# Turn rev_text into an XML document
rev_xml <- read_xml(rev_text)
# Examine the structure of rev_xml
xml_structure(rev_xml)
# Find all nodes using XPATH "/api/query/pages/page/revisions/rev"
xml_find_all(rev_xml, "/api/query/pages/page/revisions/rev")
# Find all rev nodes anywhere in document
rev_nodes <- xml_find_all(rev_xml, "//rev")
# Use xml_text() to get text from rev_nodes
xml_text(rev_nodes)
# All rev nodes
rev_nodes <- xml_find_all(rev_xml, "//rev")
# The first rev node
first_rev_node <- xml_find_first(rev_xml, "//rev")
# Find all attributes with xml_attrs()
xml_attrs(first_rev_node)
# Find user attribute with xml_attr()
xml_attr(first_rev_node, attr="user")
# Find user attribute for all rev nodes
xml_attr(rev_nodes, attr="user")
# Find anon attribute for all rev nodes
xml_attr(rev_nodes, attr="anon")
get_revision_history <- function(article_title){
  # Fetch the XML revision history for an article and flatten it into a
  # data frame with one row per revision: user, parsed timestamp, and
  # the first 40 characters of the revision content.
  rev_resp <- rev_history(article_title, format = "xml")
  # Parse the response body text into an XML document
  rev_xml <- read_xml(content(rev_resp, "text"))
  # Every <rev> node, wherever it appears in the document
  rev_nodes <- xml_find_all(rev_xml, "//rev")
  # Assemble the result directly; content is truncated to 40 chars
  data.frame(
    user = xml_attr(rev_nodes, attr = "user"),
    timestamp = readr::parse_datetime(xml_attr(rev_nodes, "timestamp")),
    content = substr(xml_text(rev_nodes), 1, 40)
  )
}
# Call function for "Hadley Wickham"
get_revision_history(article_title = "Hadley Wickham")
Chapter 4 - Web Scraping with XPATH
Web scraping 101 - sometimes a website does not have an API, so a different approach is required:
HTML structure - basically, content within tags, much like XML:
This is a test
requests that “This is a test” be available in paragraph form. Reformatting data (especially to a rectangular format such as a data frame):
Example code includes:
# Load rvest
library(rvest)
## Loading required package: xml2
##
## Attaching package: 'rvest'
## The following object is masked from 'package:readr':
##
## guess_encoding
# Hadley Wickham's Wikipedia page
test_url <- "https://en.wikipedia.org/wiki/Hadley_Wickham"
# Read the URL stored as "test_url" with read_html()
test_xml <- read_html(test_url)
# Print test_xml
test_xml
## {xml_document}
## <html class="client-nojs" lang="en" dir="ltr">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset= ...
## [2] <body class="mediawiki ltr sitedir-ltr mw-hide-empty-elt ns-0 ns-sub ...
test_node_xpath <- "//*[contains(concat( \" \", @class, \" \" ), concat( \" \", \"vcard\", \" \" ))]"
# Use html_node() to grab the node with the XPATH stored as `test_node_xpath`
node <- html_node(x = test_xml, xpath = test_node_xpath)
# Print the first element of the result
node[1]
## $node
## <pointer: 0x000000000b91bb80>
# The first thing we'll grab is a name, from the first element of the previously extracted table (now stored as table_element)
table_element <- node
# Extract the name of table_element
element_name <- html_name(table_element)
# Print the name
element_name
## [1] "table"
second_xpath_val <- "//*[contains(concat( \" \", @class, \" \" ), concat( \" \", \"fn\", \" \" ))]"
# Extract the element of table_element referred to by second_xpath_val and store it as page_name
page_name <- html_node(x = table_element, xpath = second_xpath_val)
# Extract the text from page_name
page_title <- html_text(page_name)
# Print page_title
page_title
## [1] "Hadley Wickham"
# Turn table_element into a data frame and assign it to wiki_table
wiki_table <- html_table(table_element)
# Print wiki_table
wiki_table
## Hadley Wickham
## 1
## 2 Residence
## 3 Alma mater
## 4 Known for
## 5 Awards
## 6 Scientific career
## 7 Fields
## 8 Thesis
## 9 Doctoral advisors
## 10 Doctoral students
## 11
## Hadley Wickham
## 1
## 2 United States
## 3 Iowa State University, University of Auckland
## 4 R programming language packages
## 5 John Chambers Award (2006)\nFellow of the American Statistical Association (2015)
## 6 Scientific career
## 7 Statistics\nData science\nR (programming language)
## 8 Practical tools for exploring data and models (2008)
## 9 Di Cook\nHeike Hofmann
## 10 Garrett Grolemund
## 11
# Rename the columns of wiki_table
colnames(wiki_table) <- c("key", "value")
# Remove the empty row from wiki_table
cleaned_table <- subset(wiki_table, !(key == ""))
# Print cleaned_table
cleaned_table
## key
## 2 Residence
## 3 Alma mater
## 4 Known for
## 5 Awards
## 6 Scientific career
## 7 Fields
## 8 Thesis
## 9 Doctoral advisors
## 10 Doctoral students
## value
## 2 United States
## 3 Iowa State University, University of Auckland
## 4 R programming language packages
## 5 John Chambers Award (2006)\nFellow of the American Statistical Association (2015)
## 6 Scientific career
## 7 Statistics\nData science\nR (programming language)
## 8 Practical tools for exploring data and models (2008)
## 9 Di Cook\nHeike Hofmann
## 10 Garrett Grolemund
Chapter 5 - CSS Web Scraping and Final Case Study
CSS (cascading style sheets) web scraping in theory:
Final case study: Introduction:
Wrap up:
Example code includes:
library(rvest)
## Loading required package: xml2
##
## Attaching package: 'rvest'
## The following object is masked from 'package:readr':
##
## guess_encoding
# Hadley Wickham's Wikipedia page
test_url <- "https://en.wikipedia.org/wiki/Hadley_Wickham"
# Read the URL stored as "test_url" with read_html()
test_xml <- read_html(test_url)
# Print test_xml
test_xml
## {xml_document}
## <html class="client-nojs" lang="en" dir="ltr">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset= ...
## [2] <body class="mediawiki ltr sitedir-ltr mw-hide-empty-elt ns-0 ns-sub ...
# Select the table elements
html_nodes(test_xml, css = "table")
## {xml_nodeset (2)}
## [1] <table class="infobox biography vcard" style="width:22em">\n<tr>\n<t ...
## [2] <table class="nowraplinks hlist navbox-inner" style="border-spacing: ...
# Select elements with class = "infobox"
html_nodes(test_xml, css = ".infobox")
## {xml_nodeset (1)}
## [1] <table class="infobox biography vcard" style="width:22em">\n<tr>\n<t ...
# Select elements with id = "firstHeading"
html_nodes(test_xml, css = "#firstHeading")
## {xml_nodeset (1)}
## [1] <h1 id="firstHeading" class="firstHeading" lang="en">Hadley Wickham< ...
# Extract element with class infobox
infobox_element <- html_nodes(test_xml, css = ".infobox")
# Get tag name of infobox_element
element_name <- html_name(infobox_element)
# Print element_name
element_name
## [1] "table"
# Extract element with class fn
page_name <- html_node(x = infobox_element, css=".fn")
# Get contents of page_name
page_title <- html_text(page_name)
# Print page_title
page_title
## [1] "Hadley Wickham"
# Load httr
library(httr)
# The API url
base_url <- "https://en.wikipedia.org/w/api.php"
# Set query parameters
query_params <- list(action="parse",
page="Hadley Wickham",
format="xml")
# Get data from API
resp <- GET(url = "https://en.wikipedia.org/w/api.php", query = query_params)
# Parse response
resp_xml <- content(resp)
# Load rvest
library(rvest)
# Read page contents as HTML
page_html <- read_html(xml_text(resp_xml))
# Extract infobox element
infobox_element <- html_node(page_html, css=".infobox")
# Extract page name element from infobox
page_name <- html_node(infobox_element, css=".fn")
# Extract page name as text
page_title <- html_text(page_name)
# Your code from earlier exercises
wiki_table <- html_table(infobox_element)
colnames(wiki_table) <- c("key", "value")
cleaned_table <- subset(wiki_table, !key == "")
# Create a dataframe for full name
name_df <- data.frame(key = "Full name", value = page_title)
# Combine name_df with cleaned_table
wiki_table2 <- rbind(name_df, cleaned_table)
# Print wiki_table
wiki_table2
## key
## 1 Full name
## 2 Residence
## 3 Alma mater
## 4 Known for
## 5 Awards
## 6 Scientific career
## 7 Fields
## 8 Thesis
## 9 Doctoral advisors
## 10 Doctoral students
## value
## 1 Hadley Wickham
## 2 United States
## 3 Iowa State University, University of Auckland
## 4 R programming language packages
## 5 John Chambers Award (2006)\nFellow of the American Statistical Association (2015)
## 6 Scientific career
## 7 Statistics\nData science\nR (programming language)
## 8 Practical tools for exploring data and models (2008)
## 9 Di Cook\nHeike Hofmann
## 10 Garrett Grolemund
library(httr)
library(rvest)
library(xml2)
get_infobox <- function(title){
  # Scrape the Wikipedia infobox for `title` into a key/value data
  # frame, with the page's full name prepended as the first row.
  base_url <- "https://en.wikipedia.org/w/api.php"
  # Ask the MediaWiki parse API for the rendered page, wrapped in XML
  query_params <- list(action = "parse",
                       page = title,
                       format = "xml")
  resp <- GET(url = base_url, query = query_params)
  resp_xml <- content(resp)
  # The page HTML is carried as the text of the XML response
  page_html <- read_html(xml_text(resp_xml))
  # Locate the infobox table and the "fn" (full name) element inside it
  infobox_element <- html_node(x = page_html, css = ".infobox")
  page_name <- html_node(x = infobox_element, css = ".fn")
  page_title <- html_text(page_name)
  # Turn the infobox into a two-column data frame and drop empty keys
  wiki_table <- html_table(infobox_element)
  colnames(wiki_table) <- c("key", "value")
  cleaned_table <- subset(wiki_table, !wiki_table$key == "")
  # Prepend the page title as a "Full name" row and return
  rbind(data.frame(key = "Full name", value = page_title),
        cleaned_table)
}
# Test get_infobox with "Hadley Wickham"
get_infobox(title = "Hadley Wickham")
## key
## 1 Full name
## 2 Residence
## 3 Alma mater
## 4 Known for
## 5 Awards
## 6 Scientific career
## 7 Fields
## 8 Thesis
## 9 Doctoral advisors
## 10 Doctoral students
## value
## 1 Hadley Wickham
## 2 United States
## 3 Iowa State University, University of Auckland
## 4 R programming language packages
## 5 John Chambers Award (2006)\nFellow of the American Statistical Association (2015)
## 6 Scientific career
## 7 Statistics\nData science\nR (programming language)
## 8 Practical tools for exploring data and models (2008)
## 9 Di Cook\nHeike Hofmann
## 10 Garrett Grolemund
# Try get_infobox with "Ross Ihaka"
get_infobox(title = "Ross Ihaka")
## key
## 1 Full name
## 2 Ihaka at the 2010 New Zealand Open Source Awards
## 3 Residence
## 4 Alma mater
## 5 Known for
## 6 Awards
## 7 Scientific career
## 8 Fields
## 9 Institutions
## 10 Thesis
## 11 Doctoral advisor
## value
## 1 Ross Ihaka
## 2 Ihaka at the 2010 New Zealand Open Source Awards
## 3 Auckland, New Zealand
## 4 University of AucklandUniversity of California, Berkeley
## 5 R programming language
## 6 Pickering Medal (2008)
## 7 Scientific career
## 8 Statistical Computing
## 9 University of Auckland
## 10 Ruaumoko (1985)
## 11 David R. Brillinger
# Try get_infobox with "Grace Hopper"
get_infobox(title = "Grace Hopper")
## key
## 1 Full name
## 2 Rear Admiral Grace M. Hopper, 1984
## 3 Nickname(s)
## 4 Born
## 5 Died
## 6 Place of burial
## 7 Allegiance
## 8 Service/branch
## 9 Years of service
## 10 Rank
## 11 Awards
## value
## 1 Grace Murray Hopper
## 2 Rear Admiral Grace M. Hopper, 1984
## 3 "Amazing Grace"
## 4 (1906-12-09)December 9, 1906\nNew York City, New York, U.S.
## 5 January 1, 1992(1992-01-01) (aged 85)Arlington, Virginia, U.S.
## 6 Arlington National Cemetery
## 7 United States of America
## 8 United States Navy
## 9 1943–1966, 1967–1971, 1972–1986
## 10 Rear admiral (lower half)
## 11 Defense Distinguished Service Medal Legion of Merit Meritorious Service Medal American Campaign Medal World War II Victory Medal National Defense Service Medal Armed Forces Reserve Medal with two Hourglass Devices Naval Reserve Medal Presidential Medal of Freedom (posthumous)
Chapter 1 - Basic plotting with lattice
Introduction - general objectives:
Optional arguments:
Box and whisker plots and reordering elements:
Example code includes:
data(airquality)
str(airquality)
## 'data.frame': 153 obs. of 6 variables:
## $ Ozone : int 41 36 12 18 NA 28 23 19 8 NA ...
## $ Solar.R: int 190 118 149 313 NA NA 299 99 19 194 ...
## $ Wind : num 7.4 8 12.6 11.5 14.3 14.9 8.6 13.8 20.1 8.6 ...
## $ Temp : int 67 72 74 62 56 66 65 59 61 69 ...
## $ Month : int 5 5 5 5 5 5 5 5 5 5 ...
## $ Day : int 1 2 3 4 5 6 7 8 9 10 ...
# Load the lattice package
library(lattice)
# Create the histogram
histogram(~ Ozone, data = airquality)
# Create the histogram
histogram(~ Ozone, data = airquality,
# Specify number of bins
nint = 15,
# Specify quantity displayed on y-axis
type = "count")
# Create the scatter plot
xyplot(Ozone ~ Solar.R, data = airquality)
# Create scatterplot
xyplot(Ozone ~ Temp, data = airquality,
# Add main label
main = "Environmental conditions in New York City (1973)",
# Add axis labels
ylab = "Ozone (ppb)",
xlab = "Temperature (Fahrenheit)")
# Create a density plot
densityplot(~ Ozone, data = airquality,
# Choose how raw data is shown
plot.points = "jitter")
data(USCancerRates, package="latticeExtra")
str(USCancerRates)
## 'data.frame': 3041 obs. of 8 variables:
## $ rate.male : num 364 346 341 336 330 ...
## $ LCL95.male : num 311 274 304 289 293 ...
## $ UCL95.male : num 423 431 381 389 371 ...
## $ rate.female : num 151 140 182 185 172 ...
## $ LCL95.female: num 124 103 161 157 151 ...
## $ UCL95.female: num 184 190 206 218 195 ...
## $ state : Factor w/ 49 levels "Alabama","Alaska",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ county :Class 'AsIs' chr [1:3041] "Pickens County" "Bullock County" "Russell County" "Barbour County" ...
rn_USCR <- row.names(USCancerRates)
# Create reordered variable
library(dplyr)
USCancerRates <-
mutate(USCancerRates,
state.ordered = reorder(state, rate.female, median, na.rm = TRUE)
)
# Create box and whisker plot
bwplot(state.ordered ~ rate.female, data = USCancerRates)
# Create box and whisker plot
bwplot(state.ordered ~ rate.female, data = USCancerRates,
# Change whiskers extent
coef = 0)
Chapter 2 - Conditioning and the Formula Interface
Conditioning - identify sources of variability in the data by examining sub-groups:
Data summary and transformation - grouping:
Incorporating external data sources:
The trellis object - lattice creates trellis objects rather than directly creating plots (as in base R):
Example code includes:
# The airquality dataset has been pre-loaded
str(airquality)
## 'data.frame': 153 obs. of 6 variables:
## $ Ozone : int 41 36 12 18 NA 28 23 19 8 NA ...
## $ Solar.R: int 190 118 149 313 NA NA 299 99 19 194 ...
## $ Wind : num 7.4 8 12.6 11.5 14.3 14.9 8.6 13.8 20.1 8.6 ...
## $ Temp : int 67 72 74 62 56 66 65 59 61 69 ...
## $ Month : int 5 5 5 5 5 5 5 5 5 5 ...
## $ Day : int 1 2 3 4 5 6 7 8 9 10 ...
# Create a histogram
histogram(~ Ozone | factor(Month),
data = airquality,
# Define the layout
layout=c(2, 3),
# Change the x-axis label
xlab="Ozone (ppb)")
# USCancerRates has been pre-loaded
str(USCancerRates)
## 'data.frame': 3041 obs. of 9 variables:
## $ rate.male : num 364 346 341 336 330 ...
## $ LCL95.male : num 311 274 304 289 293 ...
## $ UCL95.male : num 423 431 381 389 371 ...
## $ rate.female : num 151 140 182 185 172 ...
## $ LCL95.female : num 124 103 161 157 151 ...
## $ UCL95.female : num 184 190 206 218 195 ...
## $ state : Factor w/ 49 levels "Alabama","Alaska",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ county :Class 'AsIs' chr [1:3041] "Pickens County" "Bullock County" "Russell County" "Barbour County" ...
## $ state.ordered: Factor w/ 49 levels "Utah","New Mexico",..: 25 25 25 25 25 25 25 25 25 25 ...
## ..- attr(*, "scores")= num [1:49(1d)] 166 166 145 169 159 ...
## .. ..- attr(*, "dimnames")=List of 1
## .. .. ..$ : chr "Alabama" "Alaska" "Arizona" "Arkansas" ...
# Create a density plot
densityplot(~ rate.male + rate.female,
data = USCancerRates,
outer = TRUE,
# Suppress data points
plot.points = FALSE,
# Add a reference line
ref=TRUE)
# Create a density plot
densityplot(~ rate.male + rate.female,
data = USCancerRates,
# Set value of 'outer'
outer=FALSE,
# Add x-axis label
xlab="Rate (per 100,000)",
# Add a legend
auto.key=TRUE,
plot.points = FALSE,
ref = TRUE)
xyplot(Ozone ~ Temp, airquality, groups = Month,
# Complete the legend spec
auto.key = list(space = "right",
title = "Month",
text = month.name[5:9]))
USCancerRates <- USCancerRates %>%
mutate(division=state.division[match(state, state.name)])
# Create 'division.ordered' by reordering levels
USCancerRates <-
mutate(USCancerRates,
division.ordered = reorder(division,
rate.male + rate.female,
mean, na.rm = TRUE))
# Create conditioned scatter plot
xyplot(rate.female ~ rate.male | division.ordered,
data = USCancerRates,
# Add reference grid
grid = TRUE,
# Add reference line
abline = c(0, 1))
# Levels of division.ordered
levels(USCancerRates$division.ordered)
## [1] "Mountain" "West North Central" "Pacific"
## [4] "Middle Atlantic" "New England" "East North Central"
## [7] "West South Central" "South Atlantic" "East South Central"
# Specify the as.table argument
xyplot(rate.female ~ rate.male | division.ordered,
data = USCancerRates,
grid = TRUE, abline = c(0, 1),
as.table=TRUE)
# Create box-and-whisker plot
bwplot(division.ordered ~ rate.male + rate.female,
data = USCancerRates,
outer = TRUE,
# Add a label for the x-axis
xlab="Rate (per 100,000)",
# Add strip labels
strip = strip.custom(factor.levels = c("Male", "Female")))
# Create "trellis" object
tplot <-
densityplot(~ rate.male + rate.female | division.ordered,
data = USCancerRates, outer = TRUE,
plot.points = FALSE, as.table = TRUE)
# Change names for the second dimension
dimnames(tplot)[[2]] <- c("Male", "Female")
# Update x-axis label and plot
update(tplot, xlab = "Rate")
# Create "trellis" object
tplot <-
densityplot(~ rate.male + rate.female | division.ordered,
data = USCancerRates, outer = TRUE,
plot.points = FALSE, as.table = TRUE)
# Inspect dimension
dim(tplot)
## [1] 9 2
dimnames(tplot)
## $division.ordered
## [1] "Mountain" "West North Central" "Pacific"
## [4] "Middle Atlantic" "New England" "East North Central"
## [7] "West South Central" "South Atlantic" "East South Central"
##
## [[2]]
## [1] "rate.male" "rate.female"
# Select subset retaining only last three divisions
tplot[7:9, ]
Chapter 3 - Controlling scales and graphical parameters
Combining scales:
Logarithmic scales:
Graphical parameters:
Using simpleTheme():
Example code includes:
# The lattice package and the USMortality dataset have been pre-loaded.
Status <- factor(c('Urban', 'Rural', 'Urban', 'Rural', 'Urban', 'Rural', 'Urban', 'Rural', 'Urban', 'Rural', 'Urban', 'Rural', 'Urban', 'Rural', 'Urban', 'Rural', 'Urban', 'Rural', 'Urban', 'Rural', 'Urban', 'Rural', 'Urban', 'Rural', 'Urban', 'Rural', 'Urban', 'Rural', 'Urban', 'Rural', 'Urban', 'Rural', 'Urban', 'Rural', 'Urban', 'Rural', 'Urban', 'Rural', 'Urban', 'Rural'), levels=c("Rural", "Urban")
)
Sex <- factor(c('Male', 'Male', 'Female', 'Female', 'Male', 'Male', 'Female', 'Female', 'Male', 'Male', 'Female', 'Female', 'Male', 'Male', 'Female', 'Female', 'Male', 'Male', 'Female', 'Female', 'Male', 'Male', 'Female', 'Female', 'Male', 'Male', 'Female', 'Female', 'Male', 'Male', 'Female', 'Female', 'Male', 'Male', 'Female', 'Female', 'Male', 'Male', 'Female', 'Female'), levels=c("Female", "Male")
)
Cause <- factor(c('Heart disease', 'Heart disease', 'Heart disease', 'Heart disease', 'Cancer', 'Cancer', 'Cancer', 'Cancer', 'Lower respiratory', 'Lower respiratory', 'Lower respiratory', 'Lower respiratory', 'Unintentional injuries', 'Unintentional injuries', 'Unintentional injuries', 'Unintentional injuries', 'Cerebrovascular diseases', 'Cerebrovascular diseases', 'Cerebrovascular diseases', 'Cerebrovascular diseases', 'Alzheimers', 'Alzheimers', 'Alzheimers', 'Alzheimers', 'Diabetes', 'Diabetes', 'Diabetes', 'Diabetes', 'Flu and pneumonia', 'Flu and pneumonia', 'Flu and pneumonia', 'Flu and pneumonia', 'Suicide', 'Suicide', 'Suicide', 'Suicide', 'Nephritis', 'Nephritis', 'Nephritis', 'Nephritis'),
levels=c('Alzheimers', 'Cancer', 'Cerebrovascular diseases', 'Diabetes', 'Flu and pneumonia', 'Heart disease', 'Lower respiratory', 'Nephritis', 'Suicide', 'Unintentional injuries')
)
Rate <- c(210.2, 242.7, 132.5, 154.9, 195.9, 219.3, 140.2, 150.8, 44.5, 62.8, 36.5, 46.9, 49.6, 71.3, 24.7, 37.2, 36.1, 42.2, 34.9, 42.2, 19.4, 21.8, 25.5, 30.6, 24.9, 29.5, 17.1, 21.8, 17.7, 20.8, 12.9, 16.3, 19.2, 26.3, 5.3, 6.2, 15.7, 18.3, 10.7, 13.9)
SE <- c(0.2, 0.6, 0.2, 0.4, 0.2, 0.5, 0.2, 0.4, 0.1, 0.3, 0.1, 0.2, 0.1, 0.3, 0.1, 0.2, 0.1, 0.2, 0.1, 0.2, 0.1, 0.2, 0.1, 0.2, 0.1, 0.2, 0.1, 0.2, 0.1, 0.2, 0.1, 0.1, 0.1, 0.2, 0, 0.1, 0.1, 0.2, 0, 0.1)
USMortality <- data.frame(Status=Status, Sex=Sex, Cause=Cause, Rate=Rate, SE=SE)
# Specify upper bound to exclude Heart disease and Cancer
x_limits <- c(0, 100)
# Draw the plot
dotplot(Cause ~ Rate | Sex + Status, data = USMortality, as.table = TRUE,
xlim = x_limits)
dotplot(Cause ~ Rate | Sex + Status, data = USMortality,
as.table = TRUE,
scales = list(x = list(relation = "free",
# Specify limits for each panel
limits = list(c(0, 50), c(0, 80),
c(0, 50), c(0, 80) ))))
dotplot(Cause ~ Rate | Sex + Status, data = USMortality,
as.table = TRUE,
# Change the number of tick marks
scales = list(x = list(tick.number = 10,
# Show `Rate` labels on both bottom and top
alternating = 3,
# Rotate `Rate` labels by 90 degrees
rot = 90),
# Rotate `Cause` labels by 45 degrees
y = list(rot = 45)))
# Define tick positions at 2^3 up to 2^8 (i.e. 8, 16, 32, 64, 128, 256).
# Use the documented `^` operator: `**` is an undocumented synonym that
# the R parser merely translates to `^`, and ?Arithmetic advises against it.
x_ticks_at <- 2^(3:8)
dotplot(Cause ~ Rate | Sex, data = USMortality,
groups = Status, auto.key = list(columns = 2),
scales = list(x = list(log = 2,
# A numeric vector with
# values 2^3, 2^4, ..., 2^8
at = x_ticks_at,
# A character vector,
# "8" for 2^3, "16" for 2^4, etc.
labels = x_ticks_at)))
# Create the dot plot
dotplot(Cause ~ Rate | Status, data = USMortality,
groups = Sex, auto.key = list(columns = 2),
scales = list(x = list(log = TRUE,
equispaced.log = FALSE)),
# Provide pch values for the two groups
pch = c(3, 1))
dotplot(Cause ~ Rate | Status, data = USMortality,
groups = Sex, auto.key = list(columns = 2),
par.settings = simpleTheme(pch = c(3, 1)),
scales = list(x = list(log = 2, equispaced.log = FALSE)))
# The WorldPhones matrix is already provided, with the first row removed so you only need to consider consecutive years
data(WorldPhones)
WorldPhones <- WorldPhones[row.names(WorldPhones) != 1951, ]
WorldPhones
## N.Amer Europe Asia S.Amer Oceania Africa Mid.Amer
## 1956 60423 29990 4708 2568 2366 1411 733
## 1957 64721 32510 5230 2695 2526 1546 773
## 1958 68484 35218 6662 2845 2691 1663 836
## 1959 71799 37598 6856 3000 2868 1769 911
## 1960 76036 40341 8220 3145 3054 1905 1008
## 1961 79831 43173 9053 3338 3224 2005 1076
names(dimnames(WorldPhones)) <- c("Year", "Region")
# Transform matrix data to data frame
WorldPhonesDF <- as.data.frame(
# Intermediate step: convert to table
as.table(WorldPhones),
responseName = "Phones")
# Create the dot plot
dotplot(Year ~ Phones | Region,
data = WorldPhonesDF,
as.table = TRUE,
# Log-transform the x-axis
scales = list(x = list(log = TRUE,
equispaced.log = FALSE,
# Set x-axis relation to "sliced"
relation = "sliced")))
# Load latticeExtra package for ggplot2like()
library(latticeExtra)
## Loading required package: RColorBrewer
##
## Attaching package: 'latticeExtra'
## The following object is masked _by_ '.GlobalEnv':
##
## USCancerRates
## The following object is masked from 'package:ggplot2':
##
## layer
# Transform matrix data to data frame
names(dimnames(WorldPhones)) <- c("Year", "Region")
WorldPhonesDF <-
as.data.frame(as.table(WorldPhones[-1, ]),
responseName = "Phones")
# Create the dot plot
dotplot(Year ~ Phones | Region,
data = WorldPhonesDF,
as.table = TRUE,
scales = list(x = list(log = TRUE,
equispaced.log = FALSE,
relation = "sliced")),
# Fill in suitable value of par.settings
par.settings = ggplot2like(),
# Fill in suitable value of lattice.options
lattice.options = ggplot2like.opts())
# Create factor variable
airquality$Month.Name <-
factor(airquality$Month, levels = 1:12,
labels = month.name[1:12])
# Create histogram of Ozone, conditioning on Month
histogram(~ Ozone | Month.Name,
data = airquality, as.table = TRUE,
# Set border to be transparent
border = "transparent",
# Set fill color to be mid-gray
col = "grey50")
# Create factor variable
airquality$Month.Name <-
factor(airquality$Month, levels = 1:12,
labels = month.name)
levels(airquality$Month.Name)
## [1] "January" "February" "March" "April" "May"
## [6] "June" "July" "August" "September" "October"
## [11] "November" "December"
# Drop empty levels
airquality$Month.Name <- droplevels(airquality$Month.Name)
levels(airquality$Month.Name)
## [1] "May" "June" "July" "August" "September"
# Obtain colors from RColorBrewer
library(RColorBrewer)
my.colors <- brewer.pal(n = 5, name = "Set1")
# Density plot of ozone concentration grouped by month
densityplot(~ Ozone, data = airquality, groups = Month.Name,
plot.points = FALSE,
auto.key = list(space = "right"),
# Fill in value of col
par.settings = simpleTheme(col = my.colors,
# Fill in value of lwd
lwd = 2))
Chapter 4 - Customizing plots using panel functions
Panel functions:
Prepanel Functions to control limits:
Optional arguments of default panel functions:
Example code includes:
# Panel function: standard scatter plot plus marginal "rugs" marking the
# partially-missing observations that the scatter plot itself must drop
# (a point needs both x and y; a rug tick needs only one of them).
# Arguments follow the lattice panel convention: x/y are the panel's data,
# `...` forwards graphical parameters to panel.xyplot().
panel.xyrug <- function(x, y, ...)
{
# Reproduce standard scatter plot
panel.xyplot(x, y, ...)
# Identify observations with x-value missing
x.missing <- is.na(x)
# Identify observations with y-value missing
y.missing <- is.na(y)
# Draw rugs along axes: the x-axis rug shows x-values whose y is NA,
# the y-axis rug shows y-values whose x is NA (note the deliberate swap)
panel.rug(x = x[y.missing], y = y[x.missing])
}
airquality$Month.Name <-
factor(month.name[airquality$Month], levels = month.name)
xyplot(Ozone ~ Solar.R | Month.Name, data = airquality,
panel = panel.xyrug, as.table = TRUE)
# Create factor variable with month names
airquality$Month.Name <-
factor(month.name[airquality$Month], levels = month.name)
# Create box-and-whisker plot
bwplot(Month.Name ~ Ozone + Temp, airquality,
# Specify outer
outer=TRUE,
# Specify x-axis relation
scales = list(x = list(relation="free")),
# Specify layout
layout=c(2, 1),
# Specify x-axis label
xlab="Measured value")
# Create violin plot
bwplot(Month.Name ~ Ozone + Temp, airquality,
# Specify outer
outer = TRUE,
# Specify x-axis relation
scales = list(x = list(relation="free")),
# Specify layout
layout=c(2, 1),
# Specify x-axis label
xlab="Measured value",
# Replace default panel function
panel = panel.violin)
# Create panel function: smoothed-density scatter plot overlaid with a LOESS
# trend (red) and the y = x identity line. The call order fixes the drawing
# z-order: raster first, then the two reference curves on top.
panel.ss <- function(x, y, ...) {
# Call panel.smoothScatter() to draw the density raster of the points
panel.smoothScatter(x, y, ...)
# Call panel.loess() to add a LOESS smoother in red
panel.loess(x, y, col = "red")
# Call panel.abline() with intercept 0, slope 1 (the identity line)
panel.abline(0, 1)
}
# Create plot
xyplot(rate.female ~ rate.male, data = USCancerRates,
panel = panel.ss,
main = "County-wise deaths due to cancer")
## (loaded the KernSmooth namespace)
# Define prepanel function: computes per-panel axis limits so that both the
# histogram bars and an overlaid density curve fit inside the panel.
# Returns a list with lattice's prepanel contract fields (xlim, ylim).
prepanel.histdens.2 <- function(x, ...) {
# Limits the default histogram prepanel would choose
h <- prepanel.default.histogram(x, ...)
# Kernel density estimate of the same data (NAs removed)
d <- density(x, na.rm = TRUE)
# x-limits: trim to the central 99% of the data to suppress extreme tails
list(xlim = quantile(x, c(0.005, 0.995), na.rm = TRUE),
# Calculate upper y-limit: tall enough for whichever is higher,
# the density peak or the histogram's own upper limit
ylim = c(0, max(d$y, h$ylim[2])))
}
# Panel function: density histogram with a kernel-density curve drawn on top
# (histogram first so the line is not hidden behind the bars).
panel.histdens <- function(x, ...) {
panel.histogram(x, ...)
# Overlay the kernel density estimate; na.rm = TRUE because density()
# errors on NA input, unlike panel.histogram()
panel.lines(density(x, na.rm = TRUE))
}
# Create a histogram of rate.male and rate.female
histogram(~ rate.male + rate.female,
data = USCancerRates, outer = TRUE,
type = "density", nint = 50,
border = "transparent", col = "lightblue",
# The panel function: panel.histdens
panel = panel.histdens,
# The prepanel function: prepanel.histdens.2
prepanel = prepanel.histdens.2,
# Ensure that the x-axis is log-transformed
# and has relation "sliced"
scales = list(x = list(log = TRUE,
equispaced.log = FALSE,
relation = "sliced")),
xlab = "Rate (per 100,000)")
# Create the box and whisker plot
bwplot(division.ordered ~ rate.male,
data = USCancerRates,
# Indicate median by line instead of dot
pch = "|",
# Include notches for confidence interval
notch = TRUE,
# The x-axis should plot log-transformed values
scales = list(x = list(log=TRUE, equispaced.log=FALSE)),
xlab = "Death Rate in Males (per 100,000)")
# Load the 'latticeExtra' package
library(latticeExtra)
# Create summary dataset: one row per state and gender, holding the state's
# median county-level death rate. Built in long format (stacked Male then
# Female) so lattice can group/condition on Gender directly.
USCancerRates.state <-
with(USCancerRates, {
# Median male rate per state (tapply returns a named vector, one
# element per level of `state`)
rmale <- tapply(rate.male, state, median, na.rm = TRUE)
rfemale <- tapply(rate.female, state, median, na.rm = TRUE)
# Stack the two vectors; State/Gender are replicated to match.
# names(rmale) and names(rfemale) are in the same (factor-level)
# order, so recycling names(rmale) twice is safe.
data.frame(Rate = c(rmale, rfemale),
State = rep(names(rmale), 2),
Gender = rep(c("Male", "Female"),
each = length(rmale)))
})
# Reorder levels
library(dplyr)
USCancerRates.state <-
mutate(USCancerRates.state, State = reorder(State, Rate))
head(USCancerRates.state)
## Rate State Gender
## 1 286.00 Alabama Male
## 2 237.95 Alaska Male
## 3 209.30 Arizona Male
## 4 284.10 Arkansas Male
## 5 221.30 California Male
## 6 204.40 Colorado Male
# URLs for emojis
emoji.man <- "https://twemoji.maxcdn.com/72x72/1f468.png"
emoji.woman <- "https://twemoji.maxcdn.com/72x72/1f469.png"
# Create dotplot
# dotplot(State ~ Rate, data = USCancerRates.state,
# Specify grouping variable
# groups = Gender,
# Specify panel function
# panel = panel.xyimage,
# Specify emoji URLs
# pch = c(emoji.woman, emoji.man),
# Make symbols smaller
# cex = 0.75)
Chapter 5 - Extensions and the lattice ecosystem
New methods - lattice is used by many packages because it is highly extensible:
New high-level functions can be created:
Manipulation (extension) of trellis objects:
Example code includes:
# Use 'EuStockMarkets' time series data
data(EuStockMarkets)
str(EuStockMarkets)
## Time-Series [1:1860, 1:4] from 1991 to 1999: 1629 1614 1607 1621 1618 ...
## - attr(*, "dimnames")=List of 2
## ..$ : NULL
## ..$ : chr [1:4] "DAX" "SMI" "CAC" "FTSE"
# Create time series plot
xyplot(EuStockMarkets,
# Plot all series together
superpose = TRUE,
# Split up the time axis into parts
cut = list(number = 3, overlap = 0.25))
# Create time series plot
xyplot(EuStockMarkets,
# Specify panel function
panel=panel.horizonplot,
# Specify prepanel function
prepanel=prepanel.horizonplot)
# Load required packages
library(maps)
# Create map object for US counties
county.map <- map("county", plot = FALSE, fill = TRUE,
# Specify projection
projection = "sinusoidal")
# Create choropleth map
row.names(USCancerRates) <- rn_USCR
mapplot(row.names(USCancerRates) ~ log10(rate.male) + log10(rate.female),
data = USCancerRates,
xlab = "", scales = list(draw = FALSE),
# Specify map
map = county.map)
## Warning in (function (x, y, map, breaks, colramp, exact = FALSE,
## lwd = 0.5, : 64 unmatched regions: alaska,nome, alaska,wade hampton,
## alaska,haines, alaska,....
## Warning in (function (x, y, map, breaks, colramp, exact = FALSE,
## lwd = 0.5, : 64 unmatched regions: alaska,nome, alaska,wade hampton,
## alaska,haines, alaska,....
# Create subset for Louisiana
LACancerRates1 <- filter(USCancerRates, state == "Louisiana")
str(LACancerRates1)
## 'data.frame': 64 obs. of 11 variables:
## $ rate.male : num 369 361 349 338 338 ...
## $ LCL95.male : num 316 289 302 308 303 ...
## $ UCL95.male : num 428 446 402 372 376 ...
## $ rate.female : num 162 193 215 194 192 ...
## $ LCL95.female : num 134 150 184 176 170 ...
## $ UCL95.female : num 196 246 250 215 218 ...
## $ state : Factor w/ 49 levels "Alabama","Alaska",..: 17 17 17 17 17 17 17 17 17 17 ...
## $ county :Class 'AsIs' chr [1:64] "Richland Parish" "Madison Parish" "De Soto Parish" "St. Bernard Parish" ...
## $ state.ordered : Factor w/ 49 levels "Utah","New Mexico",..: 46 46 46 46 46 46 46 46 46 46 ...
## ..- attr(*, "scores")= num [1:49(1d)] 166 166 145 169 159 ...
## .. ..- attr(*, "dimnames")=List of 1
## .. .. ..$ : chr "Alabama" "Alaska" "Arizona" "Arkansas" ...
## $ division : Factor w/ 9 levels "New England",..: 5 5 5 5 5 5 5 5 5 5 ...
## $ division.ordered: Factor w/ 9 levels "Mountain","West North Central",..: 7 7 7 7 7 7 7 7 7 7 ...
## ..- attr(*, "scores")= num [1:9(1d)] 417 416 446 466 433 ...
## .. ..- attr(*, "dimnames")=List of 1
## .. .. ..$ : chr "New England" "Middle Atlantic" "South Atlantic" "East South Central" ...
# Reorder levels of county
LACancerRates2 <-
mutate(LACancerRates1,
county = reorder(county, rate.male))
# Draw confidence intervals
segplot(county ~ LCL95.male + UCL95.male,
data = LACancerRates2,
# Add point estimates
centers = rate.male,
# Draw segments rather than bands
draw.bands = FALSE)
# The 'USCancerRates' dataset
str(USCancerRates)
## 'data.frame': 3041 obs. of 11 variables:
## $ rate.male : num 364 346 341 336 330 ...
## $ LCL95.male : num 311 274 304 289 293 ...
## $ UCL95.male : num 423 431 381 389 371 ...
## $ rate.female : num 151 140 182 185 172 ...
## $ LCL95.female : num 124 103 161 157 151 ...
## $ UCL95.female : num 184 190 206 218 195 ...
## $ state : Factor w/ 49 levels "Alabama","Alaska",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ county :Class 'AsIs' chr [1:3041] "Pickens County" "Bullock County" "Russell County" "Barbour County" ...
## $ state.ordered : Factor w/ 49 levels "Utah","New Mexico",..: 25 25 25 25 25 25 25 25 25 25 ...
## ..- attr(*, "scores")= num [1:49(1d)] 166 166 145 169 159 ...
## .. ..- attr(*, "dimnames")=List of 1
## .. .. ..$ : chr "Alabama" "Alaska" "Arizona" "Arkansas" ...
## $ division : Factor w/ 9 levels "New England",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ division.ordered: Factor w/ 9 levels "Mountain","West North Central",..: 9 9 9 9 9 9 9 9 9 9 ...
## ..- attr(*, "scores")= num [1:9(1d)] 417 416 446 466 433 ...
## .. ..- attr(*, "dimnames")=List of 1
## .. .. ..$ : chr "New England" "Middle Atlantic" "South Atlantic" "East South Central" ...
# Load the 'hexbin' package
library(hexbin)
# Create hexbin plot
hexbinplot(rate.female ~ rate.male,
data = USCancerRates,
# Add a regression line
type = "r",
# function to transform counts
trans = sqrt,
# function to invert transformed counts
inv = function(x) x^2
)
# Load the 'directlabels' package
library(directlabels)
# Use the 'airquality' dataset
str(airquality)
## 'data.frame': 153 obs. of 7 variables:
## $ Ozone : int 41 36 12 18 NA 28 23 19 8 NA ...
## $ Solar.R : int 190 118 149 313 NA NA 299 99 19 194 ...
## $ Wind : num 7.4 8 12.6 11.5 14.3 14.9 8.6 13.8 20.1 8.6 ...
## $ Temp : int 67 72 74 62 56 66 65 59 61 69 ...
## $ Month : int 5 5 5 5 5 5 5 5 5 5 ...
## $ Day : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Month.Name: Factor w/ 12 levels "January","February",..: 5 5 5 5 5 5 5 5 5 5 ...
# Create factor variable
airquality$Month.Name <-
factor(month.name[airquality$Month], levels = month.name)
# Create density plot object
tplot2 <-
densityplot(~ Ozone + Temp, data = airquality,
# Variables should go in different panels
outer = TRUE,
# Specify grouping variable
groups = Month.Name,
# Suppress display of data points
plot.points = FALSE,
# Add reference line
ref = TRUE,
# Specify layout
layout = c(2, 1),
# Omit strip labels
strip = FALSE,
# Provide column-specific x-axis labels
xlab = c("Ozone (ppb)", "Temperature (F)"),
# Let panels have independent scales
scales = list(relation="free"))
# Produce plot with direct labels
direct.label(tplot2)
# 'USCancerRates' is pre-loaded
str(USCancerRates)
## 'data.frame': 3041 obs. of 11 variables:
## $ rate.male : num 364 346 341 336 330 ...
## $ LCL95.male : num 311 274 304 289 293 ...
## $ UCL95.male : num 423 431 381 389 371 ...
## $ rate.female : num 151 140 182 185 172 ...
## $ LCL95.female : num 124 103 161 157 151 ...
## $ UCL95.female : num 184 190 206 218 195 ...
## $ state : Factor w/ 49 levels "Alabama","Alaska",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ county :Class 'AsIs' chr [1:3041] "Pickens County" "Bullock County" "Russell County" "Barbour County" ...
## $ state.ordered : Factor w/ 49 levels "Utah","New Mexico",..: 25 25 25 25 25 25 25 25 25 25 ...
## ..- attr(*, "scores")= num [1:49(1d)] 166 166 145 169 159 ...
## .. ..- attr(*, "dimnames")=List of 1
## .. .. ..$ : chr "Alabama" "Alaska" "Arizona" "Arkansas" ...
## $ division : Factor w/ 9 levels "New England",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ division.ordered: Factor w/ 9 levels "Mountain","West North Central",..: 9 9 9 9 9 9 9 9 9 9 ...
## ..- attr(*, "scores")= num [1:9(1d)] 417 416 446 466 433 ...
## .. ..- attr(*, "dimnames")=List of 1
## .. .. ..$ : chr "New England" "Middle Atlantic" "South Atlantic" "East South Central" ...
# Create scatter plot
p <- xyplot(rate.female ~ rate.male, data = USCancerRates,
# Change plotting character
pch = 16,
# Make points semi-transparent
alpha = 0.25)
# Create layer with reference grid
l0 <- layer_(panel.grid())
# Create layer with reference line
l1 <- layer(panel.abline(0, 1))
# Create layer with regression fit
l2 <- layer(panel.smoother(x, y, method="lm"))
# Combine and plot
p + l0 + l1 + l2
Chapter 1 - R Time Series Visualization Tools
Refresher on xts and the plot() function:
Other useful visualizing functions:
Example code includes:
library(xts)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
##
## first, last
# data is a 504x4 xts object of Yahoo, Microsoft, Citigroup, and Dow
tmpData <- readr::read_delim("./RInputFiles/dataset_1_1.csv", delim=" ")
## Parsed with column specification:
## cols(
## Index = col_date(format = ""),
## yahoo = col_double(),
## microsoft = col_double(),
## citigroup = col_double(),
## dow_chemical = col_double()
## )
data <- xts::xts(tmpData[, -1], order.by=as.POSIXct(tmpData$Index))
# Display the first few lines of the data
head(data)
## yahoo microsoft citigroup dow_chemical
## 2015-01-01 18:00:00 50.17 44.30501 53.45259 42.48209
## 2015-01-04 18:00:00 49.13 43.89759 51.76803 41.16821
## 2015-01-05 18:00:00 49.21 43.25329 49.94556 40.50662
## 2015-01-06 18:00:00 48.59 43.80284 50.40857 40.44139
## 2015-01-07 18:00:00 50.23 45.09144 51.16711 41.44776
## 2015-01-08 18:00:00 49.72 44.71244 50.02437 41.38253
# Display the column names of the data
colnames(data)
## [1] "yahoo" "microsoft" "citigroup" "dow_chemical"
# Plot yahoo data and add title
plot(data[, "yahoo"], main="yahoo")
# Replot yahoo data with labels for X and Y axes
plot(data[, "yahoo"], main="yahoo", xlab="date", ylab="price")
# Note that type="h" is for bars
# Plot the second time series and change title
plot(data[, 2], main="microsoft")
# Replot with same title, add subtitle, use bars
plot(data[, 2], main="microsoft", sub="Daily closing price since 2015", type="h")
# Change line color to red
lines(data[, 2], col="red")
# Plot two charts on same graphical window
par(mfrow = c(2, 1))
plot(data[, 1], main="yahoo")
plot(data[, 2], main="microsoft")
# Replot with reduced margin and character sizes
par(mfrow = c(2, 1), mex=0.6, cex=0.8)
plot(data[, 1], main="yahoo")
plot(data[, 2], main="microsoft")
par(mfrow = c(1, 1), mex=1, cex=1)
# Plot the "microsoft" series
plot(data[, "microsoft"], main="Stock prices since 2015")
# Add the "dow_chemical" series in red
lines(data[, "dow_chemical"], col="red")
# Add a Y axis on the right side of the chart
axis(side=4, at=pretty(data[, "dow_chemical"]))
# Add a legend in the bottom right corner
legend("bottomright", legend=c("microsoft", "dow_chemical"), col=c("black", "red"), lty=c(1, 1))
# Plot the "citigroup" time series
plot(data[, "citigroup"], main="Citigroup")
# Create vert_line to identify January 4th, 2016 in citigroup
vert_line <- which(index(data[, "citigroup"]) == as.POSIXct("2016-01-04"))
# Add a red vertical line using vert_line
abline(v = .index(data[, "citigroup"])[vert_line], col = "red")
# Create hori_line to identify average price of citigroup
hori_line <- mean(data[, "citigroup"])
# Add a blue horizontal line using hori_line
abline(h = hori_line, col = "blue")
# Create period to hold the 3 months of 2015
period <- c("2015-01/2015-03")
# Highlight the first three months of 2015
PerformanceAnalytics::chart.TimeSeries(data[, "citigroup"], period.areas=period)
# Highlight the first three months of 2015 in light grey
PerformanceAnalytics::chart.TimeSeries(data[, "citigroup"], period.areas=period, period.color="lightgrey")
# Plot the microsoft series
plot(data[, "microsoft"], main="Dividend date and amount")
# Add the citigroup series
lines(data[, "citigroup"], col="orange", lwd=2)
# Add a new y axis for the citigroup series
axis(side=4, at=pretty(data[, "citigroup"]), col="orange")
micro_div_date <- "15 Nov. 2016"
citi_div_date <- "13 Nov. 2016"
micro_div_value <- "$0.39"
citi_div_value <- "$0.16"
# Same plot as the previous exercise
plot(data$microsoft, main = "Dividend date and amount")
lines(data$citigroup, col = "orange", lwd = 2)
axis(side = 4, at = pretty(data$citigroup), col = "orange")
# Create the two legend strings
micro <- paste0("Microsoft div. of ", micro_div_value," on ", micro_div_date)
citi <- paste0("Citigroup div. of ", citi_div_value," on ", citi_div_date)
# Create the legend in the bottom right corner
legend(x = "bottomright", legend = c(micro, citi), col = c("black", "orange"), lty = c(1, 1))
data_1_1_old <- data
Chapter 2 - Univariate Time Series
Univariate time series analysis - deals with only a single variable:
Other visualization tools:
Combining everything so far:
Example code includes:
tmpData <- readr::read_delim("./RInputFiles/dataset_2_1.csv", delim=" ")
## Parsed with column specification:
## cols(
## Index = col_date(format = ""),
## Apple = col_double()
## )
names(tmpData) <- c("Index", "apple")
data <- xts::xts(tmpData[, -1], order.by=as.Date(tmpData$Index))
# indexClass(data) <- c("POSIXt", "POSIXlt")
# Plot Apple's stock price
plot(data[, "apple"], main="Apple stock price")
# Create a time series called rtn
rtn <- TTR::ROC(data[, "apple"])
# Plot Apple daily price and daily returns
par(mfrow=c(1, 2))
plot(data[, "apple"], main="Apple stock price")
plot(rtn)
par(mfrow=c(1, 1))
dim(rtn)
## [1] 522 1
rtn <- rtn[complete.cases(rtn), ]
dim(rtn)
## [1] 521 1
# Create a histogram of Apple stock returns
hist(rtn, main="Apple stock return distribution", probability=TRUE)
# Add a density line
lines(density(rtn[complete.cases(rtn), ]))
# Redraw a thicker, red density line
lines(density(rtn[complete.cases(rtn), ]), col="red", lwd=2)
rtnRaw <- as.double(rtn$apple)
# Draw box and whisker plot for the Apple returns
boxplot(rtnRaw)
# Draw a box and whisker plot of a normal distribution
boxplot(rnorm(1000))
# Redraw both plots on the same graphical window
par(mfrow=c(2, 1))
boxplot(rtnRaw, horizontal=TRUE)
boxplot(rnorm(1000), horizontal=TRUE)
par(mfrow=c(1, 1))
# Draw autocorrelation plot
acf(rtn, main="Apple return autocorrelation")
# Redraw with a maximum lag of 10
acf(rtn, main="Apple return autocorrelation", lag.max=10)
# Create q-q plot
qqnorm(rtn, main="Apple return QQ-plot")
# Add a red line showing normality
qqline(rtn, col="red")
par(mfrow=c(2, 2))
hist(rtn, probability=TRUE)
lines(density(rtn), col="red")
boxplot(rtnRaw)
acf(rtn)
qqnorm(rtn)
qqline(rtn, col="red")
par(mfrow=c(1, 1))
Chapter 3 - Multivariate Time Series
Dealing with higher dimensions - visualization challenges with larger numbers of series:
Multivariate time series:
Higher dimension time series:
Example code includes:
# You are provided with a dataset (portfolio) containing the weights of stocks A (stocka) and B (stockb) in your portfolio for each month in 2016
stockA <- c(0.1, 0.4, 0.5, 0.5, 0.2, 0.3, 0.7, 0.8, 0.7, 0.2, 0.1, 0.2)
stockB <- c(0.9, 0.6, 0.5, 0.5, 0.8, 0.7, 0.3, 0.2, 0.3, 0.8, 0.9, 0.8)
pDates <- as.Date(c('2016-01-01', '2016-02-01', '2016-03-01', '2016-04-01', '2016-05-01', '2016-06-01', '2016-07-01', '2016-08-01', '2016-09-01', '2016-10-01', '2016-11-01', '2016-12-01'))
portfolio <- xts(data.frame(stocka=stockA, stockb=stockB), order.by=pDates)
# Plot stacked barplot
barplot(portfolio)
# Plot grouped barplot
barplot(portfolio, beside=TRUE)
tmpData <- readr::read_delim("./RInputFiles/data_3_2.csv", delim=",")
## Parsed with column specification:
## cols(
## Index = col_date(format = ""),
## sp500 = col_double(),
## citigroup = col_double(),
## microsoft = col_double(),
## apple = col_double(),
## dowchemical = col_double(),
## yahoo = col_double()
## )
# names(tmpData) <- c("Index", "apple")
my_data <- xts::xts(tmpData[, -1], order.by=as.Date(tmpData$Index))
citi <- as.numeric(my_data$citigroup)
sp500 <- as.numeric(my_data$sp500)
# Draw the scatterplot
plot(y=citi, x=sp500)
# Draw a regression line
abline(reg=lm(citi ~ sp500), col="red", lwd=2)
# my_data contains the returns for the S&P 500 and 5 stocks: Citigroup, Microsoft, Apple, Dow Chemical and Yahoo
# Create correlation matrix using Pearson method
cor(my_data)
## sp500 citigroup microsoft apple dowchemical yahoo
## sp500 1.0000000 0.5097953 0.3743215 0.3576966 0.5217243 0.2900962
## citigroup 0.5097953 1.0000000 0.4841408 0.4291841 0.5085190 0.4029490
## microsoft 0.3743215 0.4841408 1.0000000 0.5133469 0.3954523 0.4329388
## apple 0.3576966 0.4291841 0.5133469 1.0000000 0.3627755 0.3413626
## dowchemical 0.5217243 0.5085190 0.3954523 0.3627755 1.0000000 0.2938749
## yahoo 0.2900962 0.4029490 0.4329388 0.3413626 0.2938749 1.0000000
# Create correlation matrix using Spearman method
cor(my_data, method="spearman")
## sp500 citigroup microsoft apple dowchemical yahoo
## sp500 1.0000000 0.5192579 0.4244237 0.3518853 0.5316235 0.3262037
## citigroup 0.5192579 1.0000000 0.4976477 0.4374850 0.5607511 0.3780730
## microsoft 0.4244237 0.4976477 1.0000000 0.5128477 0.4684114 0.4448179
## apple 0.3518853 0.4374850 0.5128477 1.0000000 0.3681791 0.3680715
## dowchemical 0.5316235 0.5607511 0.4684114 0.3681791 1.0000000 0.3464743
## yahoo 0.3262037 0.3780730 0.4448179 0.3680715 0.3464743 1.0000000
# Create scatterplot matrix
pairs(as.data.frame(my_data))
# Create upper panel scatterplot matrix
pairs(as.data.frame(my_data), lower.panel=NULL)
cor_mat <- cor(my_data)
# In this exercise, you will use the provided correlation matrix cor_mat
# Create correlation matrix
corrplot::corrplot(cor_mat)
# Create correlation matrix with numbers
corrplot::corrplot(cor_mat, method="number")
# Create correlation matrix with colors
corrplot::corrplot(cor_mat, method="color")
# Create upper triangle correlation matrix
corrplot::corrplot(cor_mat, method="number", type="upper")
# Draw heatmap of cor_mat
corrplot::corrplot(cor_mat, method="color")
# Draw upper heatmap
corrplot::corrplot(cor_mat, method="color", type="upper")
# Draw the upper heatmap with hclust
corrplot::corrplot(cor_mat, method="color", type="upper", order="hclust")
Chapter 4 - Case Study: Stock Picking for Portfolios
Case study presentation:
New stocks:
Course conclusion:
Example code includes:
# --- Chapter 4 example code: portfolio case study ---
# In this exercise, you are provided with a dataset data containing the value and the return of the portfolio over time, in value and return, respectively.
tmpData <- readr::read_delim("./RInputFiles/data_4_1.csv", delim=",")
## Parsed with column specification:
## cols(
## Index = col_date(format = ""),
## value = col_double(),
## return = col_double()
## )
# names(tmpData) <- c("Index", "apple")
# Convert to xts using the date column as the time index
data <- xts::xts(tmpData[, -1], order.by=as.Date(tmpData$Index))
# indexClass(data) <- c("POSIXt", "POSIXlt")
# Plot the portfolio value
plot(data$value, main="Portfolio Value")
# Plot the portfolio return
plot(data$return, main="Portfolio Return")
# Plot a histogram of portfolio return
hist(data$return, probability=TRUE)
# Add a density line
lines(density(data$return), col="red", lwd=2)
# Keep a copy of the portfolio series before `data` is overwritten below
tmpPortfolioData <- data
# The new dataset data containing four new stocks is available in your workspace: Goldman Sachs (GS), Coca-Cola (KO), Walt Disney (DIS), Caterpillar (CAT)
tmpData <- readr::read_delim("./RInputFiles/data_4_3.csv", delim=",")
## Parsed with column specification:
## cols(
## Index = col_date(format = ""),
## GS = col_double(),
## KO = col_double(),
## DIS = col_double(),
## CAT = col_double()
## )
# names(tmpData) <- c("Index", "apple")
data <- xts::xts(tmpData[, -1], order.by=as.Date(tmpData$Index))
# indexClass(data) <- c("POSIXt", "POSIXlt")
# Plot the four stocks on the same graphical window
par(mfrow=c(2, 2), mex=0.8, cex=0.8)
plot(data[, 1])
plot(data[, 2])
plot(data[, 3])
plot(data[, 4])
par(mfrow=c(1, 1), mex=1, cex=1)
# In this exercise, you are provided with four individual series containing the return of the same four stocks:
# gs, ko, dis, cat
# Solution makes absolutely no sense
portfolio <- as.numeric(tmpPortfolioData$return)
# ROC() converts prices to returns; drop the leading NA with [-1]
gs <- as.numeric(TTR::ROC(data[, "GS"]))[-1]
ko <- as.numeric(TTR::ROC(data[, "KO"]))[-1]
dis <- as.numeric(TTR::ROC(data[, "DIS"]))[-1]
# NOTE(review): `cat` masks base::cat() for the rest of the session
cat <- as.numeric(TTR::ROC(data[, "CAT"]))[-1]
# Draw the scatterplot of gs against the portfolio
plot(y=portfolio, x=gs)
# Add a regression line in red
# NOTE(review): lm(gs ~ portfolio) regresses gs on portfolio, the opposite of
# the plot's axes (y=portfolio, x=gs) -- matches the course solution as-is
abline(reg=lm(gs ~ portfolio), col="red", lwd=2)
# Plot scatterplots and regression lines to a 2x2 window
par(mfrow=c(2, 2))
plot(y=portfolio, x=gs)
abline(reg=lm(gs ~ portfolio), col="red", lwd=2)
plot(y=portfolio, x=ko)
abline(reg=lm(ko ~ portfolio), col="red", lwd=2)
plot(y=portfolio, x=dis)
abline(reg=lm(dis ~ portfolio), col="red", lwd=2)
plot(y=portfolio, x=cat)
abline(reg=lm(cat ~ portfolio), col="red", lwd=2)
par(mfrow=c(1, 1))
# In this exercise, you are given a dataset old.vs.new.portfolio with the following self-explanatory columns: old.portfolio.value, new.portfolio.value, old.portfolio.rtn, new.portfolio.rtn
tmpData <- readr::read_delim("./RInputFiles/old.vs.new.portfolio.csv", delim=",")
## Parsed with column specification:
## cols(
## Index = col_date(format = ""),
## old.portfolio.value = col_double(),
## new.portfolio.value = col_double(),
## old.portfolio.rtn = col_double(),
## new.portfolio.rtn = col_double()
## )
# names(tmpData) <- c("Index", "apple")
old.vs.new.portfolio <- xts::xts(tmpData[, -1], order.by=as.Date(tmpData$Index))
# indexClass(data) <- c("POSIXt", "POSIXlt")
# Plot new and old portfolio values on same chart
plot(old.vs.new.portfolio$old.portfolio.value)
lines(old.vs.new.portfolio$new.portfolio.value, col = "red")
# Plot density of the new and old portfolio returns on same chart
plot(density(old.vs.new.portfolio$old.portfolio.rtn))
lines(density(old.vs.new.portfolio$new.portfolio.rtn), col ="red")
# Draw value, return, drawdowns of old portfolio
PerformanceAnalytics::charts.PerformanceSummary(old.vs.new.portfolio[, "old.portfolio.rtn"])
# Draw value, return, drawdowns of new portfolio
PerformanceAnalytics::charts.PerformanceSummary(old.vs.new.portfolio[, "new.portfolio.rtn"])
# Draw both portfolios on same chart
# Draw value, return, drawdowns of new portfolio
PerformanceAnalytics::charts.PerformanceSummary(old.vs.new.portfolio[, c("old.portfolio.rtn", "new.portfolio.rtn")])
Chapter 1 - Custom ggplot2 themes
Introduction to the data - finding stories in datasets:
Filtering and plotting the data:
Custom ggplot2 themes - providing a custom look to a chart:
Example code includes:
# --- ILO data preparation: load, join, type-convert, and filter to Europe ---
library(ggplot2)
load("./RInputFiles/ilo_hourly_compensation.RData")
load("./RInputFiles/ilo_working_hours.RData")
# Join both data frames
ilo_data <- ilo_hourly_compensation %>%
inner_join(ilo_working_hours, by = c("country", "year"))
# Count the resulting rows
ilo_data %>%
count()
## # A tibble: 1 x 1
## n
## <int>
## 1 612
# Examine ilo_data
ilo_data
## # A tibble: 612 x 4
## country year hourly_compensation working_hours
## <chr> <chr> <dbl> <dbl>
## 1 Australia 1980.0 8.44 34.6
## 2 Canada 1980.0 8.87 34.8
## 3 Denmark 1980.0 10.8 31.9
## 4 Finland 1980.0 8.61 35.6
## 5 France 1980.0 8.90 35.4
## 6 Italy 1980.0 8.09 35.7
## 7 Japan 1980.0 5.46 40.8
## 8 Korea, Rep. 1980.0 0.950 55.3
## 9 Norway 1980.0 11.8 30.4
## 10 Spain 1980.0 5.86 36.8
## # ... with 602 more rows
# Turn year into a factor (via as.numeric first so "1980.0" becomes level "1980")
ilo_data <- ilo_data %>%
mutate(year = as.factor(as.numeric(year)))
# Turn country into a factor
ilo_data <- ilo_data %>%
mutate(country = as.factor(country))
# Examine the European countries vector
european_countries <- c('Finland', 'France', 'Italy', 'Norway', 'Spain', 'Sweden', 'Switzerland', 'United Kingdom', 'Belgium', 'Ireland', 'Luxembourg', 'Portugal', 'Netherlands', 'Germany', 'Hungary', 'Austria', 'Czech Rep.')
european_countries
## [1] "Finland" "France" "Italy" "Norway"
## [5] "Spain" "Sweden" "Switzerland" "United Kingdom"
## [9] "Belgium" "Ireland" "Luxembourg" "Portugal"
## [13] "Netherlands" "Germany" "Hungary" "Austria"
## [17] "Czech Rep."
# Only retain European countries
ilo_data <- ilo_data %>%
filter(country %in% european_countries)
# Examine the structure of ilo_data
str(ilo_data)
## Classes 'tbl_df', 'tbl' and 'data.frame': 380 obs. of 4 variables:
## $ country : Factor w/ 30 levels "Australia","Austria",..: 8 9 15 22 25 27 28 29 8 9 ...
## $ year : Factor w/ 27 levels "1980","1981",..: 1 1 1 1 1 1 1 1 2 2 ...
## $ hourly_compensation: num 8.61 8.9 8.09 11.8 5.86 ...
## $ working_hours : num 35.6 35.4 35.7 30.4 36.8 ...
# --- Summarize by year, then scatter-plot the 2006 cross-section ---
# Group and summarize the data
ilo_data %>%
group_by(year) %>%
summarize(mean_hourly_compensation = mean(hourly_compensation),
mean_working_hours = mean(working_hours))
## # A tibble: 27 x 3
## year mean_hourly_compensation mean_working_hours
## <fct> <dbl> <dbl>
## 1 1980 9.27 34.0
## 2 1981 8.69 33.6
## 3 1982 8.36 33.5
## 4 1983 7.81 33.9
## 5 1984 7.54 33.7
## 6 1985 7.79 33.7
## 7 1986 9.70 34.0
## 8 1987 12.1 33.6
## 9 1988 13.2 33.7
## 10 1989 13.1 33.5
## # ... with 17 more rows
# Filter for 2006 (factor-vs-numeric comparison works via coercion to character)
plot_data <- ilo_data %>%
filter(year == 2006)
# Create the scatter plot
ggplot(plot_data) +
geom_point(aes(x = working_hours, y = hourly_compensation))
# Create the plot
ggplot(plot_data) +
geom_point(aes(x = working_hours, y = hourly_compensation)) +
# Add labels
labs(
x = "Working hours per week",
y = "Hourly compensation",
title = "The more people work, the less compensation they seem to receive",
subtitle = "Working hours and hourly compensation in European countries, 2006",
caption = "Data source: ILO, 2017"
)
# --- Store the plot object, then try out built-in and custom themes ---
# Save your current plot into a variable: ilo_plot
ilo_plot <- ggplot(plot_data) +
geom_point(aes(x = working_hours, y = hourly_compensation)) +
labs(
x = "Working hours per week",
y = "Hourly compensation",
title = "The more people work, the less compensation they seem to receive",
subtitle = "Working hours and hourly compensation in European countries, 2006",
caption = "Data source: ILO, 2017"
)
# Try out theme_minimal
ilo_plot +
theme_minimal()
# Try out any other possible theme function
ilo_plot +
theme_linedraw()
# Register the "Bookman" font alias for the plotting device
# NOTE(review): windowsFonts() exists only on Windows -- this line fails on other OSes
windowsFonts(Bookman=windowsFont("Bookman Old Style"))
ilo_plot <- ilo_plot +
theme_minimal() +
# Customize the "minimal" theme with another custom "theme" call
theme(
text = element_text(family = "Bookman"),
title = element_text(color = "gray25"),
plot.subtitle = element_text(size=12),
plot.caption = element_text(color = "gray30")
)
# Render the plot object
ilo_plot
ilo_plot +
# "theme" calls can be stacked upon each other, so this is already the third call of "theme"
theme(
plot.background = element_rect(fill = "gray95"),
plot.margin = unit(c(5, 10, 5, 10), units = "mm")
)
Chapter 2 - Creating Custom and Unique Visualization
Visualizing aspects of data with facets:
Custom plot to emphasize change:
Polishing the dot plot:
Finalizing plots for different audiences and devices:
Example code includes:
# --- Chapter 2: facet the scatter plot by year (1996 vs 2006) ---
# Filter ilo_data to retain the years 1996 and 2006
ilo_data <- ilo_data %>%
filter(year == 1996 | year == 2006)
# Again, you save the plot object into a variable so you can save typing later on
ilo_plot <- ggplot(ilo_data, aes(x = working_hours, y = hourly_compensation)) +
geom_point() +
labs(
x = "Working hours per week",
y = "Hourly compensation",
title = "The more people work, the less compensation they seem to receive",
# NOTE(review): subtitle still says "2006" although the data now covers 1996 and 2006
subtitle = "Working hours and hourly compensation in European countries, 2006",
caption = "Data source: ILO, 2017"
) +
# Add facets here
facet_grid(facets = . ~ year)
ilo_plot
# For a starter, let's look at what you did before: adding various theme calls to your plot object
ilo_plot +
theme_minimal() +
theme(
text = element_text(family = "Bookman", color = "gray25"),
plot.subtitle = element_text(size = 12),
plot.caption = element_text(color = "gray30"),
plot.background = element_rect(fill = "gray95"),
plot.margin = unit(c(5, 10, 5, 10), units = "mm")
)
# Custom ggplot2 theme for these notes: "minimal" base theme with Bookman
# typography, muted gray text, a light gray canvas, and wider plot margins.
# Returns a theme object that can be added to any ggplot.
theme_ilo <- function() {
  custom_tweaks <- theme(
    text = element_text(family = "Bookman", color = "gray25"),
    plot.subtitle = element_text(size = 12),
    plot.caption = element_text(color = "gray30"),
    plot.background = element_rect(fill = "gray95"),
    plot.margin = unit(c(5, 10, 5, 10), units = "mm")
  )
  theme_minimal() + custom_tweaks
}
# --- Apply the custom theme, tweak facet strips, and build the first dot plot ---
# Apply your theme function
ilo_plot <- ilo_plot + theme_ilo()
# Examine ilo_plot
ilo_plot
ilo_plot +
# Add another theme call
theme(
# Change the background fill to make it a bit darker
strip.background = element_rect(fill = "gray60", color = "gray95"),
# Make text a bit bigger and change its color to white
strip.text = element_text(size = 11, color = "white")
)
# Create the dot plot: one path per country connecting its 1996 and 2006 values
ggplot(ilo_data) +
geom_path(aes(x=working_hours, y=country))
ggplot(ilo_data) +
geom_path(aes(x = working_hours, y = country),
# Add an arrow to each path
arrow = arrow(length = unit(1.5, "mm"), type = "closed"))
ggplot(ilo_data) +
geom_path(aes(x = working_hours, y = country),
arrow = arrow(length = unit(1.5, "mm"), type = "closed")) +
# Add a geom_text() geometry to label each endpoint with its value
geom_text(
aes(x = working_hours,
y = country,
label = round(working_hours, 1))
)
# --- Reorder countries with forcats, then rebuild the labelled dot plot ---
library(forcats)
# Reorder country factor levels
ilo_data <- ilo_data %>%
# Arrange data frame
arrange(country, year) %>%
# Reorder countries by working hours in 2006 (`last` picks the latest value per country)
mutate(country = fct_reorder(country,
working_hours,
last))
# Plot again
ggplot(ilo_data) +
geom_path(aes(x = working_hours, y = country),
arrow = arrow(length = unit(1.5, "mm"), type = "closed")) +
geom_text(
aes(x = working_hours,
y = country,
label = round(working_hours, 1))
)
# Save plot into an object for reuse
ilo_dot_plot <- ggplot(ilo_data) +
geom_path(aes(x = working_hours, y = country),
arrow = arrow(length = unit(1.5, "mm"), type = "closed")) +
# Specify the hjust aesthetic with a conditional value
geom_text(
aes(x = working_hours,
y = country,
label = round(working_hours, 1),
# Push 2006 labels left of the point and 1996 labels right, so they don't overlap the path
hjust = ifelse(year == "2006", 1.4, -0.4)
),
# Change the appearance of the text
size = 3,
family = "Bookman",
color = "gray25"
)
ilo_dot_plot
# --- Finalize the dot plot: labels, theme, viewport, and country annotations ---
# Reuse ilo_dot_plot
ilo_dot_plot <- ilo_dot_plot +
# Add labels to the plot
labs(
x = "Working hours per week",
y = "Country",
title = "People work less in 2006 compared to 1996",
subtitle = "Working hours in European countries, development since 1996",
caption = "Data source: ILO, 2017"
) +
# Apply your theme
theme_ilo() +
# Change the viewport (coord_cartesian zooms without dropping data, unlike xlim())
coord_cartesian(xlim = c(25, 41))
# View the plot
ilo_dot_plot
# Compute temporary data set for optimal label placement
median_working_hours <- ilo_data %>%
group_by(country) %>%
summarize(median_working_hours_per_country = median(working_hours)) %>%
ungroup()
# Have a look at the structure of this data set
str(median_working_hours)
## Classes 'tbl_df', 'tbl' and 'data.frame': 17 obs. of 2 variables:
## $ country : Factor w/ 30 levels "Netherlands",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ median_working_hours_per_country: num 27 27.8 28.4 31 30.9 ...
ilo_dot_plot +
# Add label for country, centered at the per-country median working hours
geom_text(data = median_working_hours,
aes(y = country,
x = median_working_hours_per_country,
label = country),
vjust = -0.5,
size=3,
family = "Bookman",
color = "gray25") +
# Remove axes and grids so only the annotated paths remain
theme(
axis.ticks = element_blank(),
axis.title = element_blank(),
axis.text = element_blank(),
panel.grid = element_blank(),
# Also, let's reduce the font size of the subtitle
plot.subtitle = element_text(size = 9)
)
Chapter 3 - Introduction to R Markdown
What is R Markdown?
Formatting with R Markdown:
R Code in R Markdown Documents:
Images in R Markdown Files:
Example code is contained in the summary Excel worksheet.
Chapter 4 - Customizing R Markdown Reports
Advanced YAML Settings (YAML is a recursive acronym for "YAML Ain't Markup Language"):
Custom stylesheets - creating a unique theme for a report:
Beautiful tables:
Summary:
Example code is contained in the summary Excel worksheet.
Chapter 1 - Binomial Distribution
Flipping coins in R - for example, rbinom(1, 1, 0.5) - 1 draw of 1 coin with a 50% chance of being heads:
Density and cumulative density:
Expected value and variance:
Example code includes:
# --- Chapter 1 example code: simulating and evaluating the binomial ---
# NOTE(review): no set.seed() call, so simulated values differ run to run;
# the captured `##` output shows one particular realization.
# Generate 10 separate random flips with probability .3
rbinom(10, 1, 0.3)
## [1] 0 1 0 0 1 1 0 0 1 0
# Generate 100 occurrences of flipping 10 coins, each with 30% probability
rbinom(100, 10, 0.3)
## [1] 2 1 6 4 2 3 3 6 8 5 1 1 3 7 1 5 4 6 4 3 4 2 4 2 4 1 2 5 1 7 2 5 2 5 3
## [36] 4 5 2 3 3 0 4 3 3 5 2 4 1 2 3 2 1 4 5 4 0 5 6 5 2 1 2 3 2 2 4 2 5 3 5
## [71] 3 4 1 2 4 1 3 2 6 3 4 2 4 6 6 2 2 2 4 6 4 4 2 1 4 3 0 4 3 3
# Calculate the probability that 2 are heads using dbinom
dbinom(2, 10, 0.3)
## [1] 0.2334744
# Confirm your answer with a simulation using rbinom
mean(rbinom(10000, 10, 0.3) == 2)
## [1] 0.2353
# Calculate the probability that at least five coins are heads
# (complement of the CDF at 4: P(X >= 5) = 1 - P(X <= 4))
1 - pbinom(4, 10, 0.3)
## [1] 0.1502683
# Confirm your answer with a simulation of 10,000 trials
mean(rbinom(10000, 10, 0.3) >= 5)
## [1] 0.1533
# Here is how you computed the answer in the last problem
mean(rbinom(10000, 10, .3) >= 5)
## [1] 0.149
# Try now with 100, 1000, 10,000, and 100,000 trials
mean(rbinom(100, 10, .3) >= 5)
## [1] 0.16
mean(rbinom(1000, 10, .3) >= 5)
## [1] 0.158
mean(rbinom(10000, 10, .3) >= 5)
## [1] 0.1518
mean(rbinom(100000, 10, .3) >= 5)
## [1] 0.15187
# Calculate the expected value using the exact formula (E[X] = n * p)
25 * 0.3
## [1] 7.5
# Confirm with a simulation using rbinom
mean(rbinom(10000, 25, 0.3))
## [1] 7.4447
# Calculate the variance using the exact formula (Var[X] = n * p * (1 - p))
25 * 0.3 * (1 - 0.3)
## [1] 5.25
# Confirm with a simulation using rbinom
var(rbinom(10000, 25, 0.3))
## [1] 5.15845
Chapter 2 - Laws of Probability
Probability of Event A and Event B:
Probability of A or B:
Multiplying random variables:
Adding random variables:
Example code includes:
# --- Chapter 2 example code: probability laws via simulation ---
# Simulate 100,000 flips of a coin with a 40% chance of heads
A <- rbinom(100000, 1, 0.4)
# Simulate 100,000 flips of a coin with a 20% chance of heads
B <- rbinom(100000, 1, 0.2)
# Estimate the probability both A and B are heads
# (independent events: P(A and B) = P(A) * P(B) = 0.08)
mean(A & B)
## [1] 0.0805
# You've already simulated 100,000 flips of coins A and B
A <- rbinom(100000, 1, .4)
B <- rbinom(100000, 1, .2)
# Simulate 100,000 flips of coin C (70% chance of heads)
C <- rbinom(100000, 1, .7)
# Estimate the probability A, B, and C are all heads
mean(A & B & C)
## [1] 0.05589
# Simulate 100,000 flips of a coin with a 60% chance of heads
A <- rbinom(100000, 1, 0.6)
# Simulate 100,000 flips of a coin with a 10% chance of heads
B <- rbinom(100000, 1, 0.1)
# Estimate the probability either A or B is heads
# (P(A or B) = P(A) + P(B) - P(A)P(B) = 0.64)
mean(A | B)
## [1] 0.63736
# Use rbinom to simulate 100,000 draws from each of X and Y
X <- rbinom(100000, 10, 0.6)
Y <- rbinom(100000, 10, 0.7)
# Estimate the probability either X or Y is <= to 4
mean((X <= 4) | (Y <= 4))
## [1] 0.20613
# Use pbinom to calculate the probabilities separately
prob_X_less <- pbinom(4, 10, 0.6)
prob_Y_less <- pbinom(4, 10, 0.7)
# Combine these to calculate the exact probability either <= 4
prob_X_less + prob_Y_less - prob_X_less * prob_Y_less
## [1] 0.2057164
# Simulate 100,000 draws of a binomial with size 20 and p = .1
X <- rbinom(100000, 20, 0.1)
# Estimate the expected value of X
mean(X)
## [1] 1.9991
# Estimate the expected value of 5 * X (E[kX] = k * E[X])
mean(5 * X)
## [1] 9.9955
# Estimate the variance of X
var(X)
## [1] 1.786197
# Estimate the variance of 5 * X (Var[kX] = k^2 * Var[X])
var(5 * X)
## [1] 44.65493
# Simulate 100,000 draws of X (size 20, p = .3) and Y (size 40, p = .1)
X <- rbinom(100000, 20, 0.3)
Y <- rbinom(100000, 40, 0.1)
# Estimate the expected value of X + Y
mean(X + Y)
## [1] 9.99048
# Find the variance of X + Y (independent: variances add)
var(X + Y)
## [1] 7.798627
# Find the variance of 3 * X + Y
var(3 * X + Y)
## [1] 41.20331
Chapter 3 - Bayesian Statistics
Updating with evidence:
Prior probability - may not be equal odds prior to seeing any evidence:
Bayes theorem:
Example code includes:
# --- Chapter 3 example code: Bayesian updating by simulation and by dbinom ---
# Simulate 50000 cases of flipping 20 coins from fair and from biased
fair <- rbinom(50000, 20, 0.5)
biased <- rbinom(50000, 20, 0.75)
# How many fair cases, and how many biased, led to exactly 11 heads?
fair_11 <- sum(fair == 11)
biased_11 <- sum(biased == 11)
# Find the fraction of fair coins that are 11 out of all coins that were 11
# (posterior probability with equal priors)
fair_11 / (fair_11 + biased_11)
## [1] 0.8487457
# How many fair cases, and how many biased, led to exactly 16 heads?
fair_16 <- sum(fair == 16)
biased_16 <- sum(biased == 16)
# Find the fraction of fair coins that are 16 out of all coins that were 16
fair_16 / (fair_16 + biased_16)
## [1] 0.02418033
# Simulate 8000 cases of flipping a fair coin, and 2000 of a biased coin
# (unequal priors: 80% fair, 20% biased, encoded by the sample sizes)
fair_flips <- rbinom(8000, 20, 0.5)
biased_flips <- rbinom(2000, 20, 0.75)
# Find the number of cases from each coin that resulted in 14/20
fair_14 <- sum(fair_flips == 14)
biased_14 <- sum(biased_flips == 14)
# Use these to estimate the posterior probability
fair_14 / (fair_14 + biased_14)
## [1] 0.4651515
# Simulate 80,000 draws from fair coin, 10,000 from each of high and low coins
flips_fair <- rbinom(80000, 20, 0.5)
flips_high <- rbinom(10000, 20, 0.75)
flips_low <- rbinom(10000, 20, 0.25)
# Compute the number of coins that resulted in 14 heads from each of these piles
fair_14 <- sum(flips_fair == 14)
high_14 <- sum(flips_high == 14)
low_14 <- sum(flips_low == 14)
# Compute the posterior probability that the coin was fair
fair_14 / (fair_14 + high_14 + low_14)
## [1] 0.6370197
# Use dbinom to calculate the probability of 11/20 heads with fair or biased coin
# (exact likelihoods replace the simulation counts above)
probability_fair <- dbinom(11, 20, 0.5)
probability_biased <- dbinom(11, 20, 0.75)
# Calculate the posterior probability that the coin is fair
probability_fair / (probability_fair + probability_biased)
## [1] 0.8554755
# Find the probability that a coin resulting in 14/20 is fair
probability_fair <- dbinom(14, 20, .5)
probability_biased <- dbinom(14, 20, .75)
probability_fair / (probability_fair + probability_biased)
## [1] 0.179811
# Find the probability that a coin resulting in 18/20 is fair
probability_fair <- dbinom(18, 20, .5)
probability_biased <- dbinom(18, 20, .75)
probability_fair / (probability_fair + probability_biased)
## [1] 0.002699252
# Use dbinom to find the probability of 16/20 from a fair or biased coin
probability_16_fair <- dbinom(16, 20, 0.5)
probability_16_biased <- dbinom(16, 20, 0.75)
# Use Bayes' theorem to find the posterior probability that the coin is fair
# (prior: 99% fair, 1% biased)
(probability_16_fair * 0.99) / (probability_16_fair * 0.99 + probability_16_biased * 0.01)
## [1] 0.7068775
Chapter 4 - Related Distributions
Normal distribution - symmetrical bell curve, Gaussian:
Poisson distribution - approximates the binomial under the assumption of a large number of trials each with a low probability:
Geometric distribution - example of flipping a coin with probability p and assessing when the first success occurs:
Example code includes:
# Draw two stacked histograms (one facet per input sample) so the two
# distributions can be compared visually on a shared x-axis.
# Returns a ggplot object.
compare_histograms <- function(variable1, variable2) {
  combined <- data.frame(
    value = c(variable1, variable2),
    variable = rep(c("Variable 1", "Variable 2"),
                   times = c(length(variable1), length(variable2)))
  )
  ggplot(combined, aes(value)) +
    geom_histogram() +
    facet_wrap(~ variable, nrow = 2)
}
# --- Chapter 4 example code: normal/Poisson/geometric approximations ---
# Draw a random sample of 100,000 from the Binomial(1000, .2) distribution
binom_sample <- rbinom(100000, 1000, 0.2)
# Draw a random sample of 100,000 from the normal approximation
# (mean = n*p = 200, sd = sqrt(n*p*(1-p)) = sqrt(160))
normal_sample <- rnorm(100000, 200, sqrt(160))
# Compare the two distributions with the compare_histograms function
compare_histograms(binom_sample, normal_sample)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Use binom_sample to estimate the probability of <= 190 heads
mean(binom_sample <= 190)
## [1] 0.2292
# Use normal_sample to estimate the probability of <= 190 heads
mean(normal_sample <= 190)
## [1] 0.2152
# Calculate the probability of <= 190 heads with pbinom
pbinom(190, 1000, 0.2)
## [1] 0.2273564
# Calculate the probability of <= 190 heads with pnorm
pnorm(190, 200, sqrt(160))
## [1] 0.2145977
# Draw a random sample of 100,000 from the Binomial(10, .2) distribution
# (small n: the normal approximation is visibly poor here)
binom_sample <- rbinom(100000, 10, 0.2)
# Draw a random sample of 100,000 from the normal approximation
normal_sample <- rnorm(100000, 2, sqrt(1.6))
# Compare the two distributions with the compare_histograms function
compare_histograms(binom_sample, normal_sample)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Draw a random sample of 100,000 from the Binomial(1000, .002) distribution
binom_sample <- rbinom(100000, 1000, 0.002)
# Draw a random sample of 100,000 from the Poisson approximation (lambda = n*p = 2)
poisson_sample <- rpois(100000, 2)
# Compare the two distributions with the compare_histograms function
compare_histograms(binom_sample, poisson_sample)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Find the percentage of simulated values that are 0
mean(poisson_sample == 0)
## [1] 0.13513
# Use dpois to find the exact probability that a draw is 0
dpois(0, 2)
## [1] 0.1353353
# Simulate 100,000 draws from Poisson(1)
X <- rpois(100000, 1)
# Simulate 100,000 draws from Poisson(2)
Y <- rpois(100000, 2)
# Add X and Y together to create Z
Z <- X + Y
# Use compare_histograms to compare Z to the Poisson(3)
# (sum of independent Poissons is Poisson with summed rates)
compare_histograms(Z, rpois(100000, 3))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Simulate 100 instances of flipping a 20% coin
flips <- rbinom(100, 1, 0.2)
# Use which to find the first case of 1 ("heads")
which(flips == 1)[1]
## [1] 6
# Existing code for finding the first instance of heads
which(rbinom(100, 1, .2) == 1)[1]
## [1] 5
# Replicate this 100,000 times using replicate()
replications <- replicate(100000, which(rbinom(100, 1, .2) == 1)[1])
# Histogram the replications with qplot
qplot(replications)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Generate 100,000 draws from the corresponding geometric distribution
# (rgeom counts failures before the first success, i.e. it is 0-based)
geom_sample <- rgeom(100000, 0.2)
# Compare the two distributions with compare_histograms
compare_histograms(replications, geom_sample)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Find the probability the machine breaks on 5th day or earlier
pgeom(4, 0.1)
## [1] 0.40951
# Find the probability the machine is still working on 20th day
1 - pgeom(19, 0.1)
## [1] 0.1215767
# Calculate the probability of machine working on day 1-30
still_working <- 1 - pgeom(0:29, 0.1)
# Plot the probability for days 1 to 30
qplot(1:30, still_working)
Chapter 1 - Bootstrapping for Parameter Estimates
Introduction - beginning with bootstrapping approach:
Percentile and standard error methods:
Re-centering bootstrap distributions for hypothesis testing:
Example code includes:
# --- Chapter 1 example code: bootstrap CI for the median rent (infer package) ---
manhattan <- readr::read_csv("./RInputFiles/manhattan.csv")
## Parsed with column specification:
## cols(
## rent = col_integer()
## )
# Will need to either call library(infer) or add infer:: to this code
library(infer)
# Generate bootstrap distribution of medians
rent_ci_med <- manhattan %>%
# Specify the variable of interest
specify(response = rent) %>%
# Generate 15000 bootstrap samples
generate(reps = 15000, type = "bootstrap") %>%
# Calculate the median of each bootstrap sample
calculate(stat = "median")
# View the structure of rent_ci_med
str(rent_ci_med)
## Classes 'tbl_df', 'tbl' and 'data.frame': 15000 obs. of 2 variables:
## $ replicate: int 1 2 3 4 5 6 7 8 9 10 ...
## $ stat : num 2422 2350 2262 2325 2350 ...
## - attr(*, "response")= symbol rent
# Plot a histogram of rent_ci_med
ggplot(rent_ci_med, aes(x=stat)) +
geom_histogram(binwidth=50)
# Percentile method: middle 95% of the bootstrap distribution
rent_ci_med %>%
summarize(l = quantile(stat, 0.025),
u = quantile(stat, 0.975))
## # A tibble: 1 x 2
## l u
## <dbl> <dbl>
## 1 2162 2875
# Standard error method: observed median +/- t* times the bootstrap SE
# Calculate observed median
rent_med_obs <- manhattan %>%
# Calculate observed median rent
summarize(median(rent)) %>%
# Extract numerical value
pull()
# Determine critical value from the t-distribution with n - 1 degrees of freedom
t_star <- qt(0.975, df = nrow(manhattan) - 1)
# Construct interval
rent_ci_med %>%
# FIX: use data masking (`stat`) instead of `rent_ci_med$stat`; referencing the
# data frame by name inside a dplyr verb bypasses the pipe and would silently
# ignore any grouping/filtering. The numeric result here is unchanged.
summarize(boot_se = sd(stat)) %>%
summarize(l = rent_med_obs - t_star * boot_se,
u = rent_med_obs + t_star * boot_se)
## # A tibble: 1 x 2
## l u
## <dbl> <dbl>
## 1 1994 2706
# --- Bootstrap CIs for mean/SD of visits, and re-centered bootstrap tests ---
data(ncbirths, package="openintro")
str(ncbirths)
## 'data.frame': 1000 obs. of 13 variables:
## $ fage : int NA NA 19 21 NA NA 18 17 NA 20 ...
## $ mage : int 13 14 15 15 15 15 15 15 16 16 ...
## $ mature : Factor w/ 2 levels "mature mom","younger mom": 2 2 2 2 2 2 2 2 2 2 ...
## $ weeks : int 39 42 37 41 39 38 37 35 38 37 ...
## $ premie : Factor w/ 2 levels "full term","premie": 1 1 1 1 1 1 1 2 1 1 ...
## $ visits : int 10 15 11 6 9 19 12 5 9 13 ...
## $ marital : Factor w/ 2 levels "married","not married": 1 1 1 1 1 1 1 1 1 1 ...
## $ gained : int 38 20 38 34 27 22 76 15 NA 52 ...
## $ weight : num 7.63 7.88 6.63 8 6.38 5.38 8.44 4.69 8.81 6.94 ...
## $ lowbirthweight: Factor w/ 2 levels "low","not low": 2 2 2 2 2 1 2 1 2 2 ...
## $ gender : Factor w/ 2 levels "female","male": 2 2 1 2 1 2 2 2 2 1 ...
## $ habit : Factor w/ 2 levels "nonsmoker","smoker": 1 1 1 1 1 1 1 1 1 1 ...
## $ whitemom : Factor w/ 2 levels "not white","white": 1 1 2 2 1 1 1 1 2 2 ...
# Remove NA visits
ncbirths_complete_visits <- ncbirths %>%
filter(!is.na(visits))
# Generate 15000 bootstrap means
visit_ci_mean <- ncbirths_complete_visits %>%
specify(response=visits) %>%
generate(reps=15000, type="bootstrap") %>%
calculate(stat="mean")
# Calculate the 90% CI via percentile method (5th and 95th percentiles)
visit_ci_mean %>%
summarize(l = quantile(stat, 0.05),
u = quantile(stat, 0.95))
## # A tibble: 1 x 2
## l u
## <dbl> <dbl>
## 1 11.9 12.3
# Calculate 15000 bootstrap SDs
visit_ci_sd <- ncbirths_complete_visits %>%
specify(response=visits) %>%
generate(reps=15000, type="bootstrap") %>%
calculate(stat="sd")
# Calculate the 90% CI via percentile method
visit_ci_sd %>%
summarize(l = quantile(stat, 0.05),
u = quantile(stat, 0.95))
## # A tibble: 1 x 2
## l u
## <dbl> <dbl>
## 1 3.74 4.16
# Generate 15000 bootstrap samples centered at null (H0: median rent = 2500)
rent_med_ht <- manhattan %>%
specify(response = rent) %>%
hypothesize(null = "point", med = 2500) %>%
generate(reps = 15000, type = "bootstrap") %>%
calculate(stat = "median")
# Calculate observed median
rent_med_obs <- manhattan %>%
summarize(median(rent)) %>%
pull()
# Calculate p-value (one-sided: fraction of null medians above the observed one)
rent_med_ht %>%
filter(stat > rent_med_obs) %>%
summarize(n() / 15000)
## # A tibble: 1 x 1
## `n()/15000`
## <dbl>
## 1 0.948
# Generate 1500 bootstrap means centered at null (H0: mean weight = 7)
weight_mean_ht <- ncbirths %>%
specify(response = weight) %>%
hypothesize(null = "point", mu = 7) %>%
generate(reps=1500, type="bootstrap") %>%
calculate(stat="mean")
# Calculate observed mean
weight_mean_obs <- ncbirths %>%
summarize(mean(weight)) %>%
pull()
# Calculate p-value (two-sided: double the one-tail fraction)
weight_mean_ht %>%
filter(stat > weight_mean_obs) %>%
summarize((n()/1500) * 2)
## # A tibble: 1 x 1
## `(n()/1500) * 2`
## <dbl>
## 1 0.0253
Chapter 2 - Introducing the t-distribution
The t-distribution - especially useful when the population standard deviation is unknown (as is typically the case):
Estimating a mean with a t-interval:
The t-interval for paired data:
Testing a mean with a t-test:
Example code includes:
# --- Chapter 2 example code: tail probabilities and quantiles of the t-distribution ---
# P(T < 3) for df = 10
(x <- pt(3, df = 10))
## [1] 0.9933282
# P(T > 3) for df = 10 (upper tail = 1 - CDF)
(y <- 1 - pt(3, df=10))
## [1] 0.006671828
# P(T > 3) for df = 100
(z <- 1 - pt(3, df=100))
## [1] 0.001703958
# Comparison: heavier tails at lower df, so y > z
y == z
## [1] FALSE
y > z
## [1] TRUE
y < z
## [1] FALSE
# 95th percentile for df = 10
(x <- qt(0.95, df = 10))
## [1] 1.812461
# upper bound of middle 95th percent for df = 10
(y <- qt(0.975, df = 10))
## [1] 2.228139
# upper bound of middle 95th percent for df = 100
(z <- qt(0.975, df = 100))
## [1] 1.983972
# Comparison: the critical value shrinks toward the normal's 1.96 as df grows
y == z
## [1] FALSE
y > z
## [1] TRUE
y < z
## [1] FALSE
# --- One-sample t-intervals on the ACS 2012 data ---
data(acs12, package="openintro")
# Subset for employed respondents
acs12_emp <- acs12 %>%
filter(employment == "employed")
# Construct 95% CI for avg time_to_work (t.test reports the CI alongside the test)
t.test(acs12_emp$time_to_work, conf.level=0.95)
##
## One Sample t-test
##
## data: acs12_emp$time_to_work
## t = 32.635, df = 782, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 24.43369 27.56120
## sample estimates:
## mean of x
## 25.99745
# Same for average hours worked per week
t.test(acs12_emp$hrs_work, conf.level=0.95)
##
## One Sample t-test
##
## data: acs12_emp$hrs_work
## t = 87.521, df = 842, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 38.05811 39.80429
## sample estimates:
## mean of x
## 38.9312
# --- Paired-data t-intervals at several confidence levels, then a t-test ---
data(textbooks, package="openintro")
# 90% CI (narrowest of the three)
t.test(textbooks$diff, conf.level = 0.9)
##
## One Sample t-test
##
## data: textbooks$diff
## t = 7.6488, df = 72, p-value = 6.928e-11
## alternative hypothesis: true mean is not equal to 0
## 90 percent confidence interval:
## 9.981505 15.541783
## sample estimates:
## mean of x
## 12.76164
# 95% CI
t.test(textbooks$diff, conf.level = 0.95)
##
## One Sample t-test
##
## data: textbooks$diff
## t = 7.6488, df = 72, p-value = 6.928e-11
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 9.435636 16.087652
## sample estimates:
## mean of x
## 12.76164
# 99% CI (widest: higher confidence costs precision)
t.test(textbooks$diff, conf.level = 0.99)
##
## One Sample t-test
##
## data: textbooks$diff
## t = 7.6488, df = 72, p-value = 6.928e-11
## alternative hypothesis: true mean is not equal to 0
## 99 percent confidence interval:
## 8.347154 17.176133
## sample estimates:
## mean of x
## 12.76164
# Conduct HT: two-sided test of H0: mean difference = 0
t.test(textbooks$diff, mu=0, alternative="two.sided", conf.level=0.95)
##
## One Sample t-test
##
## data: textbooks$diff
## t = 7.6488, df = 72, p-value = 6.928e-11
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 9.435636 16.087652
## sample estimates:
## mean of x
## 12.76164
# Calculate 15000 bootstrap medians of diff (infer pipeline; note the
# statistic is the median, not the mean)
textdiff_med_ci <- textbooks %>%
specify(response = diff) %>%
generate(reps=15000, type="bootstrap") %>%
calculate(stat = "median")
# Calculate the 95% CI via percentile method: middle 95% of the bootstrap
# distribution (2.5th and 97.5th percentiles)
textdiff_med_ci %>%
summarize(l=quantile(stat, 0.025),
u=quantile(stat, 0.975))
## # A tibble: 1 x 2
## l u
## <dbl> <dbl>
## 1 5.04 11.7
# Simulation-based hypothesis test for a median of paired differences
# (math - science scores, High School and Beyond data).
data(hsb2, package="openintro")
# Calculate diff = math score minus science score per student
hsb2 <- hsb2 %>%
mutate(diff = math - science)
# Generate 15000 bootstrap medians centered at the null value (mu = 0)
scorediff_med_ht <- hsb2 %>%
specify(response=diff) %>%
hypothesize(null="point", mu=0) %>%
generate(reps=15000, type="bootstrap") %>%
calculate(stat="median")
# Calculate observed median of differences
scorediff_med_obs <- hsb2 %>%
summarize(median(diff)) %>%
pull()
# Calculate p-value: one-tail proportion of null medians beyond the
# observed value, doubled for a two-sided test. NOTE(review): the `>`
# direction assumes the observed median lies above the null center --
# confirm before reusing with other data.
scorediff_med_ht %>%
filter(stat > scorediff_med_obs) %>%
summarize(p_val = (n() / 15000) * 2)
## # A tibble: 1 x 1
## p_val
## <dbl>
## 1 0.529
Chapter 3 - Inference for Difference in Two Parameters
Hypothesis testing for comparing two means:
Bootstrap CI for difference in two means:
Comparing means with a t-test:
Example code includes:
# Randomization test for a difference in two means (stem cell treatment
# vs control; response is the before/after change in heart pumping capacity).
data(stem.cell, package="openintro")
str(stem.cell)
## 'data.frame': 18 obs. of 3 variables:
## $ trmt : Factor w/ 2 levels "ctrl","esc": 1 1 1 1 1 1 1 1 1 2 ...
## $ before: num 35.2 36.5 39.8 39.8 41.8 ...
## $ after : num 29.5 29.5 36.2 38 37.5 ...
# Calculate difference between before and after
stem.cell <- stem.cell %>%
mutate(change = after - before)
# Calculate observed difference in means
diff_mean <- stem.cell %>%
# Group by treatment group
group_by(trmt) %>%
# Calculate mean change for each group
summarize(mean_change = mean(change)) %>%
# Extract (pull() takes the last column, mean_change)
pull() %>%
# Calculate difference (second group minus first, i.e. esc - ctrl)
diff()
# Generate 1000 differences in means via randomization
diff_ht_mean <- stem.cell %>%
# y ~ x
specify(change ~ trmt) %>%
# Null = no difference between means
hypothesize(null = "independence") %>%
# Shuffle labels 1000 times
generate(reps = 1000, type = "permute") %>%
# Calculate test statistic; order = c("esc", "ctrl") via rev(levels(...))
calculate(stat = "diff in means", order=rev(levels(stem.cell$trmt)))
# Calculate p-value (one-sided)
diff_ht_mean %>%
# Identify simulated test statistics at least as extreme as observed
filter(stat > diff_mean) %>%
# Calculate p-value; 0 here means no permutation was as extreme, i.e.
# the p-value is smaller than 1/1000, not literally zero
summarize(p_val = (n() / 1000))
## # A tibble: 1 x 1
## p_val
## <dbl>
## 1 0
# Randomization test: does birth weight differ by mother's smoking habit?
# Remove subjects with missing habit
ncbirths_complete_habit <- ncbirths %>%
filter(!is.na(habit))
# Calculate observed difference in means
diff_mean <- ncbirths_complete_habit %>%
# Group by habit group
group_by(habit) %>%
# Calculate mean weight for each group
summarize(mean_weight = mean(weight)) %>%
# Extract (pull() takes the last column, mean_weight)
pull() %>%
# Calculate difference (second level minus first)
diff()
# Generate 1000 differences in means via randomization
diff_ht_mean <- ncbirths_complete_habit %>%
# y ~ x
specify(weight ~ habit) %>%
# Null = no difference between means
hypothesize(null = "independence") %>%
# Shuffle labels 1000 times
generate(reps = 1000, type = "permute") %>%
# Calculate test statistic with reversed factor-level order
calculate(stat = "diff in means", order=rev(levels(ncbirths_complete_habit$habit)))
# Calculate p-value
diff_ht_mean %>%
# Identify simulated test statistics at least as extreme as observed
# (lower tail here, since the observed difference is negative)
filter(stat < diff_mean) %>%
# Calculate p-value: one-tail proportion doubled for a two-sided test
summarize(p_val = (n()/1000) * 2)
## # A tibble: 1 x 1
## p_val
## <dbl>
## 1 0.0280
# Bootstrap CI for the difference in mean birth weight by smoking habit.
# Generate 1500 bootstrap difference in means
diff_mean_ci <- ncbirths_complete_habit %>%
specify(weight ~ habit) %>%
generate(reps = 1500, type = "bootstrap") %>%
calculate(stat = "diff in means", order=rev(levels(ncbirths_complete_habit$habit)))
# Calculate the 95% CI via percentile method; the interval excludes 0,
# consistent with the significant test result above
diff_mean_ci %>%
summarize(l=quantile(stat, 0.025),
u=quantile(stat, 0.975))
## # A tibble: 1 x 2
## l u
## <dbl> <dbl>
## 1 -0.583 -0.0530
# Bootstrap CI for a difference in medians (weeks of gestation by habit).
# Remove subjects with missing habit and weeks
ncbirths_complete_habit_weeks <- ncbirths %>%
filter(!is.na(habit) & !is.na(weeks))
# Generate 1500 bootstrap difference in medians
diff_med_ci <- ncbirths_complete_habit_weeks %>%
specify(weeks ~ habit) %>%
generate(reps = 1500, type = "bootstrap") %>%
calculate(stat="diff in medians", order=rev(levels(ncbirths_complete_habit_weeks$habit)))
# Calculate the 92% CI via percentile method: middle 92%, i.e. the 4th and
# 96th percentiles
diff_med_ci %>%
summarize(l=quantile(stat, 0.04),
u=quantile(stat, 0.96))
## # A tibble: 1 x 2
## l u
## <dbl> <dbl>
## 1 -1.00 0
# Derive hourly pay and compare citizens vs non-citizens.
# Create hrly_pay and filter for non-missing hrly_pay and citizen.
# The formula assumes income is annual and hrs_work is hours per week
# (52 weeks/year) -- TODO confirm against the acs12 codebook.
acs12_complete_hrlypay_citizen <- acs12 %>%
mutate(hrly_pay = income / (hrs_work * 52)) %>%
filter(
!is.na(hrly_pay),
!is.na(citizen)
)
# Calculate percent missing -- over half the rows are dropped, so any
# inference applies only to the remaining subpopulation
new_n <- nrow(acs12_complete_hrlypay_citizen)
old_n <- nrow(acs12)
(perc_missing <- (old_n - new_n) / old_n)
## [1] 0.5205
# Calculate summary statistics per group (note the very unbalanced group
# sizes: 58 vs 901)
acs12_complete_hrlypay_citizen %>%
group_by(citizen) %>%
summarize(
x_bar = mean(hrly_pay),
s = sd(hrly_pay),
n = n()
)
## # A tibble: 2 x 4
## citizen x_bar s n
## <fct> <dbl> <dbl> <int>
## 1 no 21.2 34.5 58
## 2 yes 18.5 24.7 901
# Plot the distributions side by side with readable facet labels
ggplot(data = acs12_complete_hrlypay_citizen, mapping = aes(x = hrly_pay)) +
geom_histogram(binwidth = 5) +
facet_grid(. ~ citizen, labeller = labeller(citizen = c(no = "Non citizen",
yes = "Citizen")))
# Construct 95% CI and two-sided test for the difference in mean hourly pay.
# FIX: t.test() has no `null` argument -- the hypothesized difference is
# `mu`. The original `null=0` was silently swallowed by `...`; mu = 0 is
# what was intended (and is also the default, so the output is unchanged).
t.test(hrly_pay ~ citizen, data=acs12_complete_hrlypay_citizen, mu=0, alternative="two.sided")
##
## Welch Two Sample t-test
##
## data: hrly_pay by citizen
## t = 0.58058, df = 60.827, p-value = 0.5637
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -6.53483 11.88170
## sample estimates:
## mean in group no mean in group yes
## 21.19494 18.52151
Chapter 4 - Comparing Many Means
Do vocabulary scores vary between social classes?
ANOVA - Analysis of Variance:
Conditions for ANOVA:
Post-hoc testing - determining which of the means are different:
Wrap-up:
Example code includes:
# ANOVA: do mean vocabulary scores (wordsum) differ across social classes?
gss <- readr::read_csv("./RInputFiles/gss_wordsum_class.csv")
## Parsed with column specification:
## cols(
## wordsum = col_integer(),
## class = col_character()
## )
str(gss)
## Classes 'tbl_df', 'tbl' and 'data.frame': 795 obs. of 2 variables:
## $ wordsum: int 6 9 6 5 6 6 8 10 8 9 ...
## $ class : chr "MIDDLE" "WORKING" "WORKING" "WORKING" ...
## - attr(*, "spec")=List of 2
## ..$ cols :List of 2
## .. ..$ wordsum: list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ class : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## ..$ default: list()
## .. ..- attr(*, "class")= chr "collector_guess" "collector"
## ..- attr(*, "class")= chr "col_spec"
# Per-class histograms: visual check of the approximate-normality condition
ggplot(gss, aes(x=wordsum)) +
geom_histogram(binwidth=1) +
facet_grid(class ~ .)
# Fit the one-way ANOVA and tidy the table; the tiny p-value says at least
# one class mean differs
aov_wordsum_class <- aov(wordsum ~ class, data=gss)
broom::tidy(aov_wordsum_class)
## term df sumsq meansq statistic p.value
## 1 class 3 236.5644 78.854810 21.73467 1.560565e-13
## 2 Residuals 791 2869.8003 3.628066 NA NA
# Group standard deviations: check of the roughly-equal-variance condition
gss %>%
group_by(class) %>%
summarize(s = sd(wordsum))
## # A tibble: 4 x 2
## class s
## <chr> <dbl>
## 1 LOWER 2.24
## 2 MIDDLE 1.89
## 3 UPPER 2.34
## 4 WORKING 1.87
# Post-hoc pairwise t-tests with no multiple-comparison correction (we'll
# adjust the significance level, not the p-values). First argument is the
# response vector, second the grouping vector.
# FIX: the parameter is p.adjust.method -- the original `p.adjust =` only
# worked through R's partial argument matching.
pairwise.t.test(gss$wordsum, gss$class, p.adjust.method = "none") %>%
broom::tidy()
## group1 group2 p.value
## 1 MIDDLE LOWER 1.133345e-07
## 2 UPPER LOWER 4.752521e-02
## 3 WORKING LOWER 3.055619e-02
## 5 UPPER MIDDLE 2.395734e-01
## 6 WORKING MIDDLE 1.631637e-12
## 9 WORKING UPPER 3.670775e-01
Chapter 1 - Introduction to Correlation Coefficients
How are correlation coefficients calculated?
Usefulness of correlation coefficients:
Points of caution:
Example code includes:
# Physical-endurance dataset: read once here so later sections can reuse it
PE <- read.table("http://assets.datacamp.com/course/Conway/Lab_Data/Stats1.13.Lab.04.txt", header=TRUE)
# Pearson correlation computed by hand, then verified against cor().
# Take a quick peek at both vectors
(A <- c(1, 2, 3))
## [1] 1 2 3
(B <- c(3, 6, 7))
## [1] 3 6 7
# Deviations of each element from its vector's mean
diff_A <- A - mean(A)
diff_B <- B - mean(B)
# Sample covariance: sum of cross-products over N - 1.
# Named cov_AB (not `cov`) so base R's cov() function is not shadowed.
cov_AB <- sum(diff_A * diff_B) / (length(A) - 1)
# Squared deviations (`^` is the documented exponent operator; `**` is an
# undocumented synonym)
sq_diff_A <- diff_A ^ 2
sq_diff_B <- diff_B ^ 2
# Sample standard deviations: sqrt of summed squared deviations over N - 1
sd_A <- sqrt(sum(sq_diff_A) / (length(A) - 1))
sd_B <- sqrt(sum(sq_diff_B) / (length(B) - 1))
# Correlation = covariance scaled by both standard deviations
correlation <- cov_AB / (sd_A * sd_B)
correlation
## [1] 0.9607689
# Check the validity of your result with the cor() command
cor(A, B)
## [1] 0.9607689
# Read data from a URL into a dataframe called PE (physical endurance) - moved above to cache
# PE <- read.table("http://assets.datacamp.com/course/Conway/Lab_Data/Stats1.13.Lab.04.txt", header=TRUE)
# Summary statistics for all four columns (pid is just a participant id)
psych::describe(PE)
## vars n mean sd median trimmed mad min max range skew
## pid 1 200 101.81 58.85 101.5 101.71 74.87 1 204 203 0.01
## age 2 200 49.41 10.48 48.0 49.46 10.38 20 82 62 0.06
## activeyears 3 200 10.68 4.69 11.0 10.57 4.45 0 26 26 0.30
## endurance 4 200 26.50 10.84 27.0 26.22 10.38 3 55 52 0.22
## kurtosis se
## pid -1.21 4.16
## age -0.14 0.74
## activeyears 0.46 0.33
## endurance -0.44 0.77
# Scatter plots of each pair of substantive variables
plot(PE$age ~ PE$activeyears)
plot(PE$endurance ~ PE$activeyears)
plot(PE$endurance ~ PE$age)
# Correlation Analysis (pid excluded -- an id has no meaningful correlation)
round(cor(PE[, !(names(PE) == "pid")]), 2)
## age activeyears endurance
## age 1.00 0.33 -0.08
## activeyears 0.33 1.00 0.33
## endurance -0.08 0.33 1.00
# Do some correlation tests. If the null hypothesis of no correlation can be rejected on a significance level of 5%, then the relationship between variables is significantly different from zero at the 95% confidence level
cor.test(PE$age, PE$activeyears)
##
## Pearson's product-moment correlation
##
## data: PE$age and PE$activeyears
## t = 4.9022, df = 198, p-value = 1.969e-06
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.1993491 0.4473145
## sample estimates:
## cor
## 0.3289909
cor.test(PE$endurance, PE$activeyears)
##
## Pearson's product-moment correlation
##
## data: PE$endurance and PE$activeyears
## t = 4.8613, df = 198, p-value = 2.37e-06
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.1967110 0.4451154
## sample estimates:
## cor
## 0.3265402
# age vs endurance: not significant (p = 0.23); CI spans zero
cor.test(PE$endurance, PE$age)
##
## Pearson's product-moment correlation
##
## data: PE$endurance and PE$age
## t = -1.1981, df = 198, p-value = 0.2323
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.22097811 0.05454491
## sample estimates:
## cor
## -0.08483813
# The impact dataset is already loaded in
rawImpactData <- " 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, control, control, control, control, control, control, control, control, control, control, control, control, control, control, control, control, control, control, control, control, concussed, concussed, concussed, concussed, concussed, concussed, concussed, concussed, concussed, concussed, concussed, concussed, concussed, concussed, concussed, concussed, concussed, concussed, concussed, concussed, 95, 90, 87, 84, 92, 89, 78, 97, 93, 90, 89, 97, 79, 86, 85, 85, 98, 95, 96, 92, 79, 85, 97, 89, 75, 75, 84, 93, 88, 97, 93, 96, 84, 89, 95, 95, 97, 95, 92, 95, 88, 82, 77, 72, 77, 79, 63, 82, 85, 66, 76, 79, 60, 59, 60, 76, 85, 83, 67, 84, 81, 85, 91, 74, 63, 68, 78, 74, 80, 73, 74, 70, 81, 72, 90, 74, 70, 63, 65, 69, 35.29, 31.47, 30.87, 41.87, 33.28, 40.73, 38.09, 31.65, 39.59, 30.53, 33.65, 37.51, 40.39, 32.88, 33.39, 35.13, 38.51, 29.64, 35.32, 27.36, 27.19, 32.66, 26.29, 28.92, 32.77, 32.92, 34.26, 36.08, 31.63, 28.89, 35.81, 33.61, 34.46, 39.18, 33.14, 33.03, 39.01, 35.06, 30.58, 38.45, 0.42, 0.63, 0.56, 0.66, 0.56, 0.81, 0.66, 0.79, 0.68, 0.60, 0.74, 0.51, 0.82, 0.59, 0.82, 0.63, 0.73, 0.57, 0.65, 1.00, 0.57, 0.71, 0.82, 0.61, 0.72, 0.50, 0.54, 0.65, 0.66, 0.71, 0.55, 0.79, 0.48, 0.55, 1.20, 0.73, 0.60, 0.84, 0.60, 0.42, 11, 7, 8, 7, 7, 6, 6, 10, 7, 10, 7, 7, 12, 2, 9, 10, 10, 8, 5, 11, 7, 9, 9, 9, 8, 9, 6, 10, 9, 7, 9, 7, 7, 10, 10, 11, 10, 5, 8, 11, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 97, 86, 90, 85, 87, 91, 90, 94, 91, 93, 92, 89, 84, 81, 85, 87, 96, 93, 95, 93, 63, 79, 91, 85, 74, 72, 80, 59, 75, 90, 66, 85, 72, 82, 80, 59, 74, 62, 67, 66, 86, 80, 79, 70, 77, 85, 60, 72, 83, 68, 72, 79, 67, 71, 61, 72, 78, 85, 67, 80, 75, 79, 80, 72, 56, 66, 74, 69, 79, 73, 69, 61, 79, 66, 80, 70, 62, 54, 57, 63, 35.61, 37.01, 
20.15, 33.26, 28.34, 33.47, 44.28, 36.14, 37.42, 25.19, 23.63, 26.32, 43.70, 32.40, 39.32, 35.62, 39.95, 35.62, 30.21, 30.37, 29.23, 44.45, 26.12, 27.98, 60.77, 31.91, 49.62, 35.68, 55.67, 25.70, 35.21, 33.01, 37.46, 53.20, 33.20, 34.59, 39.66, 35.09, 32.30, 44.49, 0.65, 0.49, 0.75, 0.19, 0.59, 0.48, 0.77, 0.90, 0.65, 0.59, 0.55, 0.56, 0.57, 0.69, 0.73, 0.48, 0.43, 0.37, 0.47, 0.50, 0.61, 0.65, 1.12, 0.65, 0.71, 0.79, 0.64, 0.70, 0.68, 0.73, 0.58, 0.97, 0.56, 0.51, 1.30, 0.70, 0.74, 1.24, 0.65, 0.98, 10, 7, 9, 8, 8, 5, 6, 10, 8, 11, 9, 9, 10, 3, 10, 12, 10, 9, 5, 11, 3, 6, 5, 5, 1, 9, 7, 11, 6, 3, 4, 3, 1, 7, 7, 4, 5, 2, 6, 5, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 26, 34, 27, 22, 26, 35, 43, 31, 39, 25, 31, 38, 14, 16, 33, 13, 27, 15, 19, 39"
# Rebuild the course's `impact` data frame from the raw comma-separated
# string above: 14 variables x 40 subjects, listed column-by-column
# (all 40 values of subject, then all 40 of condition, etc.), which is
# exactly the order matrix() fills with its default byrow = FALSE.
rawImpactNames <- c('subject', 'condition', 'vermem1', 'vismem1', 'vms1', 'rt1', 'ic1', 'sym1', 'vermem2', 'vismem2', 'vms2', 'rt2', 'ic2', 'sym2')
splitImpactData <- stringr::str_split(rawImpactData, ",")
impactRawMatrix <- matrix(data=splitImpactData[[1]], ncol=length(rawImpactNames))
colnames(impactRawMatrix) <- rawImpactNames
rawImpactDF <- as.data.frame(impactRawMatrix, stringsAsFactors=FALSE)
# Convert every column except condition (column 2) to numeric
for (intCtr in c(1, 3:ncol(rawImpactDF))) { rawImpactDF[, intCtr] <- as.numeric(rawImpactDF[, intCtr]) }
# Strip the padding spaces left by the split, then make condition a factor
rawImpactDF$condition <- factor(stringr::str_replace_all(rawImpactDF$condition, " ", ""))
impact <- rawImpactDF
# Summary statistics entire dataset (suffix 1 = baseline test, suffix 2 =
# retest after the concussion/control period -- inferred from the analyses
# below; confirm against the course materials)
psych::describe(impact)
## vars n mean sd median trimmed mad min max range
## subject 1 40 20.50 11.69 20.50 20.50 14.83 1.00 40.00 39.00
## condition* 2 40 1.50 0.51 1.50 1.50 0.74 1.00 2.00 1.00
## vermem1 3 40 89.75 6.44 91.00 90.44 6.67 75.00 98.00 23.00
## vismem1 4 40 74.88 8.60 75.00 74.97 9.64 59.00 91.00 32.00
## vms1 5 40 34.03 3.90 33.50 34.02 3.62 26.29 41.87 15.58
## rt1 6 40 0.67 0.15 0.65 0.66 0.13 0.42 1.20 0.78
## ic1 7 40 8.28 2.05 8.50 8.38 2.22 2.00 12.00 10.00
## sym1 8 40 0.05 0.22 0.00 0.00 0.00 0.00 1.00 1.00
## vermem2 9 40 82.00 11.02 85.00 82.97 9.64 59.00 97.00 38.00
## vismem2 10 40 71.90 8.42 72.00 72.19 10.38 54.00 86.00 32.00
## vms2 11 40 35.83 8.66 35.15 34.98 6.89 20.15 60.77 40.62
## rt2 12 40 0.67 0.22 0.65 0.65 0.13 0.19 1.30 1.11
## ic2 13 40 6.75 2.98 7.00 6.81 2.97 1.00 12.00 11.00
## sym2 14 40 13.88 15.32 7.00 12.38 10.38 0.00 43.00 43.00
## skew kurtosis se
## subject 0.00 -1.29 1.85
## condition* 0.00 -2.05 0.08
## vermem1 -0.70 -0.51 1.02
## vismem1 -0.11 -0.96 1.36
## vms1 0.08 -0.75 0.62
## rt1 1.14 2.21 0.02
## ic1 -0.57 0.36 0.32
## sym1 3.98 14.16 0.03
## vermem2 -0.65 -0.81 1.74
## vismem2 -0.28 -0.87 1.33
## vms2 0.86 0.65 1.37
## rt2 0.93 1.29 0.03
## ic2 -0.16 -1.06 0.47
## sym2 0.44 -1.47 2.42
# Correlation of retest visual and verbal memory, over all 40 subjects
entirecorr <- round(cor(impact$vismem2, impact$vermem2), 2)
# Summary statistics split by condition (concussed vs control). The NaN
# skew/kurtosis rows for condition* are expected: the grouping variable is
# constant within each group.
psych::describeBy(impact, impact$condition)
##
## Descriptive statistics by group
## group: concussed
## vars n mean sd median trimmed mad min max range
## subject 1 20 30.50 5.92 30.50 30.50 7.41 21.00 40.00 19.00
## condition* 2 20 1.00 0.00 1.00 1.00 0.00 1.00 1.00 0.00
## vermem1 3 20 89.65 7.17 92.50 90.56 5.93 75.00 97.00 22.00
## vismem1 4 20 74.75 8.03 74.00 74.25 8.15 63.00 91.00 28.00
## vms1 5 20 33.20 3.62 33.09 33.27 3.32 26.29 39.18 12.89
## rt1 6 20 0.66 0.17 0.63 0.64 0.13 0.42 1.20 0.78
## ic1 7 20 8.55 1.64 9.00 8.62 1.48 5.00 11.00 6.00
## sym1 8 20 0.05 0.22 0.00 0.00 0.00 0.00 1.00 1.00
## vermem2 9 20 74.05 9.86 74.00 73.88 11.86 59.00 91.00 32.00
## vismem2 10 20 69.20 8.38 69.50 69.62 10.38 54.00 80.00 26.00
## vms2 11 20 38.27 10.01 35.15 37.32 7.73 25.70 60.77 35.07
## rt2 12 20 0.78 0.23 0.70 0.74 0.11 0.51 1.30 0.79
## ic2 13 20 5.00 2.53 5.00 4.88 2.97 1.00 11.00 10.00
## sym2 14 20 27.65 9.07 27.00 27.75 11.12 13.00 43.00 30.00
## skew kurtosis se
## subject 0.00 -1.38 1.32
## condition* NaN NaN 0.00
## vermem1 -0.79 -0.70 1.60
## vismem1 0.45 -0.72 1.80
## vms1 -0.13 -0.78 0.81
## rt1 1.38 2.41 0.04
## ic1 -0.39 -0.81 0.37
## sym1 3.82 13.29 0.05
## vermem2 0.07 -1.24 2.21
## vismem2 -0.27 -1.26 1.87
## vms2 0.77 -0.57 2.24
## rt2 1.09 -0.10 0.05
## ic2 0.39 -0.28 0.57
## sym2 -0.11 -1.25 2.03
## --------------------------------------------------------
## group: control
## vars n mean sd median trimmed mad min max range skew
## subject 1 20 10.50 5.92 10.50 10.50 7.41 1.00 20.00 19.00 0.00
## condition* 2 20 2.00 0.00 2.00 2.00 0.00 2.00 2.00 0.00 NaN
## vermem1 3 20 89.85 5.82 90.00 90.31 7.41 78.00 98.00 20.00 -0.41
## vismem1 4 20 75.00 9.34 77.00 75.50 9.64 59.00 88.00 29.00 -0.46
## vms1 5 20 34.86 4.09 34.39 34.85 4.92 27.36 41.87 14.51 0.09
## rt1 6 20 0.67 0.13 0.66 0.67 0.13 0.42 1.00 0.58 0.47
## ic1 7 20 8.00 2.41 7.50 8.12 2.22 2.00 12.00 10.00 -0.41
## sym1 8 20 0.05 0.22 0.00 0.00 0.00 0.00 1.00 1.00 3.82
## vermem2 9 20 89.95 4.36 90.50 90.06 5.19 81.00 97.00 16.00 -0.25
## vismem2 10 20 74.60 7.76 74.50 75.00 8.15 60.00 86.00 26.00 -0.23
## vms2 11 20 33.40 6.44 34.54 33.52 6.30 20.15 44.28 24.13 -0.25
## rt2 12 20 0.57 0.16 0.56 0.57 0.13 0.19 0.90 0.71 -0.16
## ic2 13 20 8.50 2.31 9.00 8.69 1.48 3.00 12.00 9.00 -0.73
## sym2 14 20 0.10 0.31 0.00 0.00 0.00 0.00 1.00 1.00 2.47
## kurtosis se
## subject -1.38 1.32
## condition* NaN 0.00
## vermem1 -0.87 1.30
## vismem1 -1.27 2.09
## vms1 -1.19 0.91
## rt1 -0.02 0.03
## ic1 -0.17 0.54
## sym1 13.29 0.05
## vermem2 -1.02 0.97
## vismem2 -1.11 1.73
## vms2 -0.77 1.44
## rt2 0.06 0.04
## ic2 -0.32 0.52
## sym2 4.32 0.07
# Split the impact data into its two experimental groups
control <- impact[impact$condition == "control", ]
concussed <- impact[impact$condition == "concussed", ]
# Correlation of retest visual vs verbal memory within each group
controlcorr <- round(cor(control$vismem2, control$vermem2), 2)
concussedcorr <- round(cor(concussed$vismem2, concussed$vermem2), 2)
# Show the overall and per-group correlations side by side
correlations <- cbind(entirecorr, controlcorr, concussedcorr)
correlations
## entirecorr controlcorr concussedcorr
## [1,] 0.45 0.37 0.35
Chapter 2 - Introduction to Linear Regression
Introduction to regression:
Regression equations and the R-squared value:
Multiple linear regression:
Example code includes:
# Look at the dataset. Note that the variables we are interested in are on the 9th to 14th columns
# (the retest measures: vermem2, vismem2, vms2, rt2, ic2, sym2)
head(impact)
## subject condition vermem1 vismem1 vms1 rt1 ic1 sym1 vermem2 vismem2
## 1 1 control 95 88 35.29 0.42 11 0 97 86
## 2 2 control 90 82 31.47 0.63 7 0 86 80
## 3 3 control 87 77 30.87 0.56 8 0 90 79
## 4 4 control 84 72 41.87 0.66 7 0 85 70
## 5 5 control 92 77 33.28 0.56 7 1 87 77
## 6 6 control 89 79 40.73 0.81 6 0 91 85
## vms2 rt2 ic2 sym2
## 1 35.61 0.65 10 0
## 2 37.01 0.49 7 0
## 3 20.15 0.75 9 0
## 4 33.26 0.19 8 0
## 5 28.34 0.59 8 1
## 6 33.47 0.48 5 0
# Create a correlation matrix for the retest variables only
correlations <- cor(impact[, 9:14])
# Visualize the correlation matrix
corrplot::corrplot(correlations)
# Least-squares line of sym2 (symptoms) on ic2 (impulse control), built
# from first principles rather than lm().
# Ingredients: group means, standard deviations, and the correlation.
mean_sym2 <- mean(impact$sym2)
mean_ic2 <- mean(impact$ic2)
sd_sym2 <- sd(impact$sym2)
sd_ic2 <- sd(impact$ic2)
r <- cor(impact$ic2, impact$sym2)
# Slope: the correlation rescaled by the ratio of standard deviations
B_1 <- r * sd_sym2 / sd_ic2
# Intercept: chosen so the line passes through the point of means
B_0 <- mean_sym2 - B_1 * mean_ic2
# Scatterplot of symptoms against impulse control
plot(x = impact$ic2, y = impact$sym2, main = "Scatterplot", ylab = "Symptoms", xlab = "Impulse Control")
# Overlay the hand-computed regression line
abline(B_0, B_1, col = "red")
# Construct the simple regression model (sym2 predicted by ic2); should
# reproduce the hand-computed slope and intercept above
model_1 <- lm(impact$sym2 ~ impact$ic2)
# Look at the results of the regression by using the summary function
summary(model_1)
##
## Call:
## lm(formula = impact$sym2 ~ impact$ic2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -22.441 -8.983 -5.309 9.127 29.696
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 29.2945 5.5090 5.318 4.9e-06 ***
## impact$ic2 -2.2844 0.7483 -3.053 0.00413 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.91 on 38 degrees of freedom
## Multiple R-squared: 0.1969, Adjusted R-squared: 0.1758
## F-statistic: 9.319 on 1 and 38 DF, p-value: 0.004125
# Create a scatter plot of Impulse Control against Symptom Score
plot(impact$sym2 ~ impact$ic2, main = "Scatterplot", ylab = "Symptoms", xlab = "Impulse Control")
# Add a regression line (abline accepts an lm object directly)
abline(model_1, col = "red")
# Multiple Regression: add verbal memory (vermem2) as a second predictor.
# Note ic2 is no longer significant once vermem2 is in the model.
model_2 <- lm(impact$sym2 ~ impact$ic2 + impact$vermem2)
# Examine the results of the regression
summary(model_2)
##
## Call:
## lm(formula = impact$sym2 ~ impact$ic2 + impact$vermem2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20.274 -8.031 -2.703 6.245 27.962
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 79.7639 14.7765 5.398 4.1e-06 ***
## impact$ic2 -1.0711 0.7335 -1.460 0.152690
## impact$vermem2 -0.7154 0.1981 -3.611 0.000898 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.12 on 37 degrees of freedom
## Multiple R-squared: 0.4062, Adjusted R-squared: 0.3742
## F-statistic: 12.66 on 2 and 37 DF, p-value: 6.482e-05
# Extract the predicted values
predicted <- fitted(model_2)
# Plotting predicted scores against observed scores; a good model tracks
# the diagonal
plot(predicted ~ impact$sym2, main = "Scatterplot", xlab = "Observed Scores", ylab = "Predicted Scores")
abline(lm(predicted ~ impact$sym2), col = "green")
Chapter 3 - Linear Regression Models (cont)
Estimation of coefficients - key concept is to minimize the residuals (specifically, the sum of the squared residuals):
Estimation of standardized and unstandardized regression coefficients:
Assumptions of linear regression:
Anscombe’s quartet:
Example code includes:
# Compare two candidate second regressors by sum of squared residuals (SSR).
# Create a linear regression with `ic2` and `vismem2` as regressors.
# NOTE(review): this rebinds model_1, which above was the simple regression
# sym2 ~ ic2 -- intended by the course, but easy to trip over.
model_1 <- lm(impact$sym2 ~ impact$ic2 + impact$vismem2)
# Extract the predicted values
predicted_1 <- fitted(model_1)
# Squared deviations of observed from predicted (`^` replaces the
# undocumented `**` synonym)
deviation_1 <- (impact$sym2 - predicted_1) ^ 2
# Sum the squared deviations
SSR_1 <- sum(deviation_1)
SSR_1
## [1] 7236.338
# Create a linear regression with `ic2` and `vermem2` as regressors
model_2 <- lm(impact$sym2 ~ impact$ic2 + impact$vermem2)
# Extract the predicted values
predicted_2 <- fitted(model_2)
# Squared deviations of observed from predicted
deviation_2 <- (impact$sym2 - predicted_2) ^ 2
# Sum the squared deviations: SSR_2 < SSR_1, so vermem2 is the better
# second regressor
SSR_2 <- sum(deviation_2)
SSR_2
## [1] 5435.454
# Create a standardized simple linear regression (both variables z-scored,
# so the slope is the standardized coefficient beta)
model_1_z <- lm(scale(impact$sym2) ~ scale(impact$ic2))
# Look at the output of this regression model
summary(model_1_z)
##
## Call:
## lm(formula = scale(impact$sym2) ~ scale(impact$ic2))
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.4648 -0.5863 -0.3465 0.5958 1.9383
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.487e-16 1.435e-01 0.000 1.00000
## scale(impact$ic2) -4.438e-01 1.454e-01 -3.053 0.00413 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9078 on 38 degrees of freedom
## Multiple R-squared: 0.1969, Adjusted R-squared: 0.1758
## F-statistic: 9.319 on 1 and 38 DF, p-value: 0.004125
# Extract the R-Squared value for this regression.
# FIX: the summary element is `r.squared`; the original `$r.square` only
# worked via partial matching (and was inconsistent with the correct
# spelling used for model_2_z below).
r_square_1 <- summary(model_1_z)$r.squared
# Calculate the correlation coefficient. sqrt() returns the magnitude
# only; the slope above is negative, so the actual correlation is
# -corr_coef_1.
corr_coef_1 <- sqrt(r_square_1)
# Create a standardized multiple linear regression
model_2_z <- lm(scale(impact$sym2) ~ scale(impact$ic2) + scale(impact$vismem2))
# Look at the output of this regression model
summary(model_2_z)
##
## Call:
## lm(formula = scale(impact$sym2) ~ scale(impact$ic2) + scale(impact$vismem2))
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.4349 -0.5949 -0.3174 0.5331 1.9646
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.450e-16 1.443e-01 0.000 1.0000
## scale(impact$ic2) -4.101e-01 1.526e-01 -2.688 0.0107 *
## scale(impact$vismem2) -1.171e-01 1.526e-01 -0.767 0.4479
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9128 on 37 degrees of freedom
## Multiple R-squared: 0.2095, Adjusted R-squared: 0.1668
## F-statistic: 4.904 on 2 and 37 DF, p-value: 0.01291
# Extract the R-Squared value for this regression
r_square_2 <- summary(model_2_z)$r.squared
# Calculate the correlation coefficient
# (sqrt of multiple R-squared = the multiple correlation coefficient R)
corr_coef_2 <- sqrt(r_square_2)
# Residual diagnostics for model_2: distribution of residuals and
# a residual-vs-fitted scatterplot with a reference line.
residual <- resid(model_2)   # model residuals
hist(residual)               # inspect the residual distribution
predicted <- fitted(model_2) # fitted (predicted) symptom scores
plot(
  residual ~ predicted,
  main = "Scatterplot",
  xlab = "Model 2 Predicted Scores",
  ylab = "Model 2 Residuals"
)
abline(lm(residual ~ predicted), col = "red")
Chapter 1 - Inferential Ideas
Variability in regression lines:
Research question - linear modeling for relationships between fat, carbohydrates, and calories in Starbucks food:
Variability of coefficients:
Example code includes:
# Rail-trail usage: regress daily ride volume on the day's high temperature.
# Load the mosaicData package and the RailTrail data
library(mosaicData)
data(RailTrail)
# Fit a linear model
ride_lm <- lm(volume ~ hightemp, data=RailTrail)
# View the summary of your model
summary(ride_lm)
##
## Call:
## lm(formula = volume ~ hightemp, data = RailTrail)
##
## Residuals:
## Min 1Q Median 3Q Max
## -254.562 -57.800 8.737 57.352 314.035
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -17.079 59.395 -0.288 0.774
## hightemp 5.702 0.848 6.724 1.71e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 104.2 on 88 degrees of freedom
## Multiple R-squared: 0.3394, Adjusted R-squared: 0.3319
## F-statistic: 45.21 on 1 and 88 DF, p-value: 1.705e-09
# Print the tidy model output (same coefficients as summary(), in a data frame)
ride_lm %>% broom::tidy()
## term estimate std.error statistic p.value
## 1 (Intercept) -17.079281 59.3953040 -0.2875527 7.743652e-01
## 2 hightemp 5.701878 0.8480074 6.7238541 1.705138e-09
# Hard-coded simulated population values (three explanatory chunks and three
# matching response chunks) that are combined into `popdata` below.
expData1 <- c(-4.3, 0.19, -2.59, -0.43, 0.59, -2.74, 3.09, 3.51, 0.56, 5.89, 0.36, -0.01, 2.59, 1.51, 2.89, -8.26, -0.46, 3.28, 4.85, 1.16, 3.03, 2.24, 1.78, -0.26, 4.29, 6.92, -6.34, 0.49, 3.4, 3.08, 2.1, -1.93, 3.72, 0.52, -4.65, 4.24, -1.21, 5.15, -10.43, 6.46, -2.78, 0.7, 2.93, -4.84, -7.08, -3.98, 8.27, -4.51, -5.22, -2.17, 2.32, 0.37, -2.53, 3.2, -8.02, -1.82, -6.17, 1.45, -0.19, -0.91, -2.02, 1.13, 11.2, 4.43, 0.88, -0.28, -9.29, 0.18, -6.9, 0.44, -9.1, -1.21, 11.32, -3.3, 3.56, 1.28, 5.76, -2.73, -9.69, -4.43, 5.71, 1.09, -8.28, -7.12, -0.33, -4.3, 4.16, 4.83, -0.29, -3.78, 5.03, 12.3, 4.79, 0.69, -11.06, 3.73, -6.64, -0.24, 5.08, -0.48, 0.68, 4.43, 2.11, 1.8, 2.98, -4.84, -3.9, 4.1, 0.05, -7.43, -2.41, 1.14, -1.87, 11.12, 6.26, 1.29, -4.54, 5.38, 3.09, -4.59, 8.55, -4.21, -0.92, 0.79, -3.48, -6.13, 3.58, 4.54, -4.83, -13.5, 1.58, -1.03, 1.34, -1.46, 5.53, -4.23, -6.95, 6.17, -0.89, 9.95, -4.12, 0.08, 2.49, -8.42, -2.4, -6.96, 7.92, -5.04, -0.25, -0.63, 8.4, 4.18, -4.86, 0.99, -5.54, -4.23, -2.23, 2.21, -0.05, -2.67, -1.14, 3.3, -5.48, 3.86, 2.1, 4.81, -1.09, -10.97, -16.68, -8.58, 3.78, 5.94, 0.35, 0.14, -8.6, -3.44, -5.14, -6.65, -0.49, -1.99, 3.54, 4.7, -0.61, 8.69, 0.91, 0.71, 3.6, -3.1, -2.99, 5.82, 3.84, 0.82, -2.74, -6.27, -3.03, 1.29, 1.58, 1.76, 4.64, -7.24, 1.54, 0.83, -0.6, -0.29, 0.78, -8.42, 9.76, 14.35, -1.09, -13.42, -1.72, 4.49, -0.02, -0.47, 8.93, 5.27, -6.06, 12.66, 0.53, -3.08, 0.52, -0.71, -0.39, -1.11, -1.72, 8.66, -1.41, 2.77, 1.03, -6.97, 7.57, -10.75, -0.88, -2.53, 1.64, 6.48, -1.61, -1.98, -5.91, 7.25, -1.67, 4.26, -7.22, 6.03, 2.92, 4.08, 9.65, -12.34, 1.24, 3.76, 3.25, -9.13, -3.23, 0.51, -1.52, -3.44, 6.75, -0.18, -3.92, -4.14, 1.14, 6.44, -0.32, 5.91, -3.55, -8.99, -6.38, -2.64, -1.47, -3.91, 12.07, 5.55, -7.94, 10.98, -6.57, -3.43, -1.13, 9.51, 11.19, -3.21, -3.19, -7.94, 2.4)
expData2 <- c(-4.59, 6.5, 3.8, -6.42, 0.78, -2.4, -2.55, -3.2, -6.3, 4.69, -0.05, 5.71, 6.5, 3.69, -4.75, 4.87, -2.42, -5.04, 3.75, 1.69, -0.19, 8.33, 2.8, -0.09, 6.24, -3.73, -2.64, 8.11, -4.43, 4.42, 3.46, -6.71, -5.47, 6.84, 4.94, 2.23, 0.92, 1.56, -3.52, -5.42, -1.04, -4.33, -0.63, -1.72, -5.42, -8.92, -4.8, -6.53, 3.33, 3.39, 4.08, -3.03, -5.11, 7.04, -0.93, -2.56, -1.45, 8.75, -4.01, -5.87, 3.36, 5.83, 1.13, -1.25, -0.04, 0.23, 0.95, 3.16, -7.17, 12.37, -9.98, -9.73, 1.55, -8.56, 13.58, 0.56, 6.39, 2.34, -5.11, 6.48, -1.62, -1.16, -6.37, 7.48, 3.51, 4.82, 1.73, -0.48, -0.84, 2.58, -3.24, -1.33, 4.69, -0.99, 9.78, -16.75, -2.92, 10.15, -4.64, 5.66, 0.89, 2.11, 1.66, 3.78, 3.43, -1.09, -1.43, -10.07, -0.87, 4.41, -3.55, -1.66, 8.28, 8.3, 1.03, 6.42, -0.33, -2.63, -4.12, 6.68, -1.32, 10.69, 7.11, -3.75, 1.16, 5.19, -4.41, -4.13, -3.32, -8.24, -3.19, 1.1, 5.45, 2.19, -10.27, -0.87, -1.32, -2.77, 7.39, -14.48, -2.06, -3.46, -4.21, -6.55, -1.59, -0.44, -3.11, -4.21, -8.38, 0.01, 10.58, 3.05, 3.67, -2.52, 2.05, -2, 7.04, -0.42, -12.23, -0.44, -1.66, -1.31, -0.16, 1.72, -3.25, 2.56, -0.21, -1.59, 2.35, -2.5, 0.44, 8.61, 2.83, 10.75, 1.1, -0.89, 4.89, -0.91, 1.83, 3.2, -1.16, -3.23, 0.96, 2.59, 6.36, -0.53, -4.2, -1.13, 2.37, -1.06, -3.69, -0.25, 8.21, -5.84, -5.53, -3.03, -0.79, -0.72, -0.67, 3.23, -6.51, 2.06, -0.4, 0.75, -2.39, -2.27, -3.65, -7.56, 3.24, -4.05, -4.2, -5.91, 5.24, -11.65, -4.16, -5.99, 1.22, 1.32, -3.63, -0.9, -3.52, -5.25, 8.05, 4.09, -3.22, 5.71, 0.67, -5.46, -5.24, -1.7, -6.4, 0.48, 4.49, 15.97, -1.42, 2.41, -1.75, 4.77, -4.45, 0.88, 0.24, 11.64, -0.51, 1.58, 4.18, -3.51, 2.32, -2.15, -5.42, 5.6, 4.18, -4.82, -1.41, -5.32, 0.58, 1.23, -5.35, -5.88, 0.76, -2.81, 0.59, -2.26, 4.05, 0.32, 5.97, 4.22, -1.79, 3.28, -4.16, -4.88, -1.24, -7.38, -2.67, -4.56, 2.45, 4.92, 1.84, -1.6, 4.79, -4.02, -9.2, 6.78, -8.21, -0.18, -4.02, 4.84, 2.81, -2.65, -4.72, -0.83, -4.69, 7.94, 3.53, 4.25, 5.06, 7.88, -1.08, -0.78, 3.41, -10.45, 0.16, 0.13, -0.6, 1.82, 5.68, 
5.7, 4.66, -5.4, 7.12, -2.49, 1.5, 1.27, -8.26, 0.58, 0.04, 3.17, -3.23, -0.66, -3.2, 1.59, -4, -1.96, -3.48, -3.4, -3.95, 4.52, 2.5, -3.37, -14.81, -3.22, -3.57, 2.44, 0.17, 4.8, -6.15, -3.4, -4.1, -2.68, 5.86, 2.92, -0.19, -4.64, 9.4, 6.49, -5.84, -6.62, 2.86, -3.56, -4.6, -4.87, 7.32, 3.82, 8.99, -1.46, 4.98, -1.41, -5.89, -8.86, 6.87, -7.25, 2.67, 2.81, -0.22, -3.37, 6.74, 3.33, 4.72, 1.02, -3.02, -4.9, 2, 3.41, 0.5, -7.36, 6.36, 4, 2.24, -6.75, -5.62, -8.14, 1.82, 6.23, -0.18, 10.71, -0.57, 1.38, 9.5, 1.12, 3.08, 0.08, -4.75, 4.23, -2.23, -0.82, 1.84, -1.15, 4.12, -5.86, -0.16, -6.5, 4.86)
expData3 <- c(-0.62, 1.5, 5.44, -1.68, -10.04, 11.49, 1.48, -1.82, 1.57, 3.06, -2.36, -7.98, 0.25, -1.77, 3.32, 1.72, -7.55, 7.24, 2.78, -4.41, -2.55, -1.3, -1.49, 2.78, -4.37, -4.41, 0.57, 0.7, -0.56, 0.17, -2.52, 0.5, 2.46, -5.55, 2.98, 0.51, -0.28, 3.97, 6.74, 0.14, 3.54, 0.38, -2.69, 1.59, 3.09, -2.73, 4.93, 7.43, 1.76, 0.77, 4.54, 3.69, 5.75, -2.68, -1.01, 6.47, 1.91, -3.48, -2.91, 3.62, -3.72, 2.09, 0.63, -6.95, -0.66, -8.25, 6.6, -3.02, 3.51, 11.77, -1.78, -1.57, 5.58, -0.44, 3.07, -2.54, -3.1, 3.77, 8.05, -2.44, -0.95, 3.73, 1.64, 7.64, 3.63, 3.39, 1.71, -6.25, -3.47, 1.6, 3.49, 0.94, 0.18, -4.29, -2.62, 14.57, -1.73, 1.79, 2.54, -2.94, -0.56, 6.87, -4.81, 6.45, 4.2, 1.65, 8.4, 7.45, 7.11, 5.56, 1.06, -8.52, -7.68, -6.63, -4.09, 0.16, -6.08, -5.78, -4.46, -1.35, 3.34, -0.51, -3.65, -3.82, 0.64, 8.2, 14.07, -0.87, 3.3, 1.7, -3.17, -0.57, -1.06, 5.74, 0.79, -5.42, -2.22, 3.72, 2.88, -5.73, 0.82, -3.04, 6.11, 7.04, 2.84, 0.29, -2.37, 4.49, -5, -4.09, 0.33, 0.34, 0.81, 2.11, -1.55, -0.75, -7.49, 6.03, 0.14, 3.58, -0.67, 7.74, 5.55, 5.44, -8.21, 8.48, 2.15, 0.04, -3.68, 6.09, 4.06, 2.85, 2.47, -1.37, 3.66, 0.63, 0.46, -1.82, -7.6, 0.05, -3.03, -7.56, 1.56, 2.44, -2.56, -9.01, -0.19, -5.88, -7.51, -5.84, 3.79, -18, 5.33, -4.15, -6.26, 0.53, 15.21, 4.85, 1.98, -1.25, -1.12, -5.65, -0.96, 11.19, 2.76, -2.89, 0.49, -1.83, -2.52, -1.03, -1.54, -1.22, 4.27, -2.39, 0, -3.61, 0.93, -8.6, -4.41, 5.23, -3.77, 0.99, -6.99, -1.57, 3, -0.47, -3.44, 10.14, 2.8, 1.28, -0.16, 8, 4.47, 1.46, 0.86, -3.14, 1.47, 2.22, -0.05, -1.66, 3.6, -2.25, -5.84, 5.91, 2.39, 4.85, 5.07, -2.37, 0.86, -4.37, -3.32, 2.24, -3.78, 1.35, 0.01, -1.53, -3.88, 2.32, -4.27, -1.08, 4.45, 3.55, 1.82, 11.33, 1.49, -1.67, 0.49, -3.35, 0.26, -2.57, -2.51, -13.35, 6.11, 8.47, 3.94, -4.56, -3.28, 3.92, 5.81, -3.57, -1.75, -4.77, 4, -3.46, -2.25, -0.94, -4.16, -11.13, 5.81, -3.29, 5.69, 10.75, 4.29, -0.21, -0.38, 6.03, -1.97, -4.57, 7.61, -5.07, 3.82, 1.73, 8.15, 5.79, 0.19, 1.28, 3.23, -5.88, -10.91, 9.61, 
-0.47, 6.15, -6.18, -0.29, 1.76, 0.34)
respData1 <- c(27.8, 39.19, 39.31, 42.6, 46.38, 40.21, 44.47, 46.24, 34.38, 69.78, 47.47, 53.41, 52.07, 47.09, 49.82, 13.05, 37.2, 54.27, 42.01, 31.94, 46.56, 10.89, 34.58, 44.43, 51.34, 44.57, 23.28, 46.32, 38.84, 50.78, 34.92, 35.1, 59.31, 40.65, 26.79, 37.85, 45.41, 52.6, 19.58, 36.63, 17.9, 63.94, 51.59, 19.05, 14.15, 37.03, 66.97, 15.58, 29.71, 43.78, 47.02, 37.27, 30.03, 43.11, 36.41, 32.44, 42.13, 46.72, 39.5, 32.4, 45.52, 26.89, 72.18, 51.75, 46.9, 41.5, 22.07, 46.82, 17.87, 50.12, 18.44, 28.21, 68.83, 24.07, 49.43, 43.31, 53.94, 26.36, 7.55, 17.13, 64.75, 36.93, 8.65, 21.06, 44.15, 40.35, 27.84, 42.75, 38.86, 21.84, 52.34, 63.13, 43.23, 38.48, 25.56, 37.81, 19.7, 32.33, 51.69, 40.01, 35.01, 60.59, 47.98, 32.92, 62.64, 15.48, 28.79, 46.04, 60.79, 32.6, 55.21, 41.05, 33.99, 58.24, 50.12, 43.4, 38.2, 58.34, 40.5, 25.68, 60.69, 44.46, 25.28, 40.56, 42.48, 32.15, 52.42, 56.78, 31.09, 18.29, 53.15, 30.62, 43.09, 35.78, 56.31, 20.42, 23.26, 48.99, 26.23, 61, 41.67, 41.04, 11.61, 41.64, 50.24, 18.98, 48.7, 17.97, 38, 50.85, 63.39, 57.49, 19.51, 54.11, 18.01, 33.74, 19.89, 44.66, 23.09, 42.45, 47.84, 39.38, 26.44, 25.24, 46.74, 33.03, 35.28, 35.73, -2.21, 20.49, 54.54, 50.42, 34.82, 47.67, 13.75, 44.62, 33.73, 31.53, 42.63, 36.64, 55.48, 49.84, 41.98, 69.24, 48.39, 39.12, 40.55, 41.95, 29.31, 34.22, 32.13, 33.6, 14.66, 23.75, 31.9, 35.76, 29, 50.02, 51.85, 13, 43.69, 45.67, 39.06, 43.92, 47.04, 11.32, 66.35, 56.47, 46.27, -0.56, 58.99, 57.85, 50.48, 24.43, 62.28, 49.07, 29.16, 63.71, 35.43, 25.9, 27.7, 40.02, 36.33, 43.11, 28.98, 51.88, 45.73, 48.29, 44.55, 28.76, 60.29, 14.26, 38.09, 43.13, 47.68, 60.55, 47.78, 29.85, 26.43, 62.71, 31.78, 38.87, 28.99, 56.19, 17.08, 44.7, 51.59, 9.56, 39.35, 48.91, 35.22, 26.53, 36.73, 43.78, 50.2, 32.55, 53.92, 33.67, 32.58, 34.44, 41.82, 51.16, 21.73, 53.09, 35.07, 6.84, 30.26, 33.74, 54.12, 32.41, 57.36, 52.16, 22, 64.32, 42.23, 51.91, 44.38, 47.45, 57.47, 48.04, 28.02, 21.08, 52.87, 30.48, 52.76, 51.07, 21.07, 37.38, 27.2, 
35.8, 35.36, 50.08, 61.72, 27.94, 54.76, 57.88, 50.57, 38.38, 58.27, 13.5, 26.88, 33.78, 67.2, 31.6, 43.14, 43.19, 43.39, 43.09, 30.85, 34.1, 72.42, 15.42, 66.27, 43.71, 28.42, 13.3, 46.63, 35.42, 52.04, 55.89, 49.6, 44.02, 24.67, 46.79, 37.98, 39.42, 23.08, 26.36, 30.27, 57.94, 22.79, 60.07, 36.51, 49.22, 22.53, 29.96, 62.01, 42.3, 30.05, 55.57, 51.68, 23.05, 28.9, 47.02, 63.76, 42.48, 56.21, 44.29, 28.75, 43.53, 29.18, 14.44, 67.68, 16.92, -1, 51.54, 24.68, 77.5, 72.04, 51.44, 59.89, 5.48, 65.2, 35.03, 29.17, 25.99, 65.76, 54.67, 51.44, 51.34, 25.71, 25.48, 45.49, 37.31, 25.55, 59.4, 23.38, 46.47, 5.27, 48.51, 59.98, 34.85, 48.4, 62.56, 27.1, 41.17, 60.38, 57.21, 17.7, 39.84, 9.25, 39.82, 60.86, 53.29, 33.74, 66.61, 66.06, 50.5, 67.98, 21.79, 25.02, 48.24, 69.45, 35.39, 67.24, 53.73, 25.21, 43.4, 50.39, 30.88, 44.33, 6.28)
respData2 <- c(18.26, 36.11, 25.88, 46.64, 38.22, -2.89, 24.17, 32.85, 37.09, 52.34, 12.35, 50.37, 31.17, 22.07, 42.49, 51.39, 34.57, 25.83, 28.45, 37.21, 41.36, 71.02, 46.11, 39.2, 36.16, 41.46, 40.83, 59.73, 30.75, 1.55, 26.67, 49.85, 35.61, 50.58, 39.23, 40.37, 45.63, 35.29, 46.06, 44.51, 36.47, 51.52, 46.69, 36.55, 53.82, 66.62, 47.33, 54.24, 27.13, 45.48, 53.06, 21.14, 52.55, 51.62, 47.59, 40.92, 27.66, 34.75, 30.73, 61.73, 36.3, 48.03, 34.31, 61.52, 27.52, 32.32, 45.33, 31.56, 32.85, 33.09, 44.52, 15.36, 48.63, 37.8, 45.67, 42.7, 40.2, 13.56, 29.57, 48.77, 29.23, 34.8, 26.91, 50.36, 29.04, 27.91, 0.98, 44.37, 29.11, 36.25, 37.73, 36.64, 18.95, 41.73, 54.34, 34.53, 56.36, 44.06, 39.32, 21.44, 53.32, 27.81, 49.95, 49.67, 68.74, 31.31, 57.06, 33.94, 39.64, 7.46, 52.8, 34.4, 70.42, 32.35, 49.45, 47.13, 34.96, 42.26, 38.28, 28.98, 37.15, 49.47, 29.23, 31.53, 28.17, 35.08, 32.34, 33.13, 33.98, 31.66, 29.63, 38.07, 49.42, 48.03, 45.81, 42.76, 63.01, 31.79, 36.26, 35.28, 34.19, 24.96, 9.13, 30.69, 36.83, 22.96, 52.03, 52.85, 49.96, 54.53, 31.88, 14.11, 51.02, 28.36, 40.92, 53.8, 63.55, 49.42, 16.49, 26.25, 34.56, 34.24, 29.5, 56.65, 33.47, 57.91, 54.78, 40.52, 41.14, 43.87, 28.43, 25.15, 38.2, 52.35, 40.83, 58.92, 37.48, 50.09, 33.76, 46.91, 30.51, 52.1, 45.28, 25.65, 28.95, 43.69, 49.32, 32.96, 34.64, 45.21, 30.77, 43.83, 45.89, 27.21, 38.51, 23.67, 37.26, 49.04, 42.06, 7.7, 36.93, 20.52, 29.9, 30.13, 55.83, 18.76, 35.06, 36.68, 34.92, 59.64, 41.81, 22.45, 28.44, 77.59, 59.44, 19.26, 34.14, 63.37, 24.93, 24.94, 16.79, 38.96, 34.77, 55, 43.88, 43.47, 29, 38.99, 20.2, 59.86, 42.71, 52.67, 47.27, 42.85, 15.67, 54, 68.67)
respData3 <- c(52.15, 49.27, 49.14, 40.6, 50.46, 58.2, 45.66, 20.46, 49.97, 42.83, 25.73, 28.75, 27.69, 22.29, 53.53, 52.18, 41.86, 69.86, 37.78, 31.75, 46.9, 32.98, 52.79, 35.54, 34.51, 26.63, 39.69, 27, 47.79, 19.5, 45.63, 26.92, 44.14, 16.93, 42.78, 56.02, 47.01, 46.89, 19.48, 25.95, 61.74, 47.83, 45.1, 41.11, 64.28, 27.7, 8.05, 49.37, 43.05, 49.23, 33.99, 25.97, 60.66, 44.42, 37.06, 40.95, 21.97, 18.88, 34.68, 41.47, 35.65, 36.49, 31.45, 36.02, 38.67, 28.87, 47.33, 48.99, 39.26, 59.34, 57.07, 39.02, 35.12, 50.94, 48.7, 59.07, 36.1, 41.1, 48.53, 36.84, 20.57, 67.63, 52.86, 28.83, 47.47, 43.59, 63.2, 59.74, 23.63, 44.15, 49.71, 41.84, 28.01, 24.4, 36.04, 46.16, 39.15, 50.76, 27.61, 11.67, 27.04, 39.95, 19.58, 31.08, 70.48, 41.18, 21.73, 51.49, 45.3, 61.05, 18.11, 26.11, 41.49, 51.15, 45.32, 35.03, 45.41, 48.5, 41.7, 56.59, 44.21, 51.48, 22.45, 38.89, 42.35, 47.44, 44.4, 47.43, 38.98, 15.87, 73.93, 57.64, 55.71, 56.87, 40.79, 35.31, 56.81, 31.64, 51.35, 40.61, 44.84, 63.09, 56.03, 36.39, 54.85, 35.81, 5.16, 25.41, 26.6, 39.91, 18.4, 30.47, 28.81, 35.49, 21.98, 54.58, 37.43, 45.14, 26.94, 42.56, 61.86, 63.86, 22.83, 41.93, 43.3, 35.72, 30.3, 23.57, 62.44, 28.81, 45.11, 38.81, 37.09, 35.89, 28.37, 25.34, 20.95, 37.1, 50.87, 44.03, 25.36, 46.32, 40.63, 13.77, 33.89, 35.41, 42.71, 57.71, 46.37, 48.02, 42.58, 20.81, 64.81, 42.34, 44.93, 24.07, 71.44, 49.04, 65.73, 30.5, 56.97, 77.54, 37.53, 49.04, 52.37, 44.21, 49.64, 41.62, 33.15, 26.12, 59.01, 50.39, 12.17, 26.18, 51.92, 35.1, 45.64, 62.54, 47.18, 23.61, 13.8, 48.7, 22.64, 23.25, 26.07, 65.36, -3.6, 44.56, 20.4, 28.77, 44.92, 52.98, 38.6, 46.51, 37.8, 41.54, 14.45, 40.69, 61.11, 55.09, 30.34, 39.57, 32.23, 48.52, 44.47, 27.24, 40.09, 48.87, 31.78, 44.45, 50.95, 36.98, 24.06, 22, 39.89, 33.79, 52.91, 18.36, 32.73, 65.36, 39.55, 36.56, 61.26, 70.61, 48.07, 41.77, 83.77, 62.77, 36.37, 38.26, 31.61, 46.1, 58.54, 24.53, 39.71, 58.49, 36, 28.21, 41.23, 31.57, 31.77, 42.54, 22.47, 48.5, 46.3, 30.97, 53.55, 23.35, 60.6, 
41.07, 46.19, 22.14, 52.2, 29.76, 42.34, 43.52, 38.48, 49.56, 59.15, 29.13, 20.9, 40.13, 25.61, 56.45, 35.77, 34.6, 27.61, 37.08, 42.26, 36.76, 23.48, 27.94, 43.68, 49.03, 54.34, 57.83, 45.74, 54, 41.25, 36.96, 56.99, 25.3, 37.1, 54.32, 39.65, 59.93, 53.08, 32.52, 25.58, 34.33, 50.6, 49.97, 23.38, 66.08, 39.08, 47.11, 54.24, 55.54, 45.4, 44.52, 36.92, 45.72, 29.69, 13.78, 41.59, 27.96, 40.98, 19.86, 37.85, 23.43, 41.28)
# Assemble the 1000-row population data frame used by the sampling examples
popdata <- data.frame(explanatory=c(expData1, expData2, expData3),
response=c(respData1, respData2, respData3)
)
str(popdata)
## 'data.frame': 1000 obs. of 2 variables:
## $ explanatory: num -4.3 0.19 -2.59 -0.43 0.59 -2.74 3.09 3.51 0.56 5.89 ...
## $ response : num 27.8 39.2 39.3 42.6 46.4 ...
# Visualize the full population with its least-squares line
ggplot(popdata, aes(x = explanatory, y = response)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)
# Draw two independent samples of 50 rows each (seeded for reproducibility)
set.seed(4747)
sample1 <- sample_n(popdata, 50)
sample2 <- sample_n(popdata, 50)
# Scatterplot and fitted line for the first sample (blue)
plot1 <- ggplot(sample1, aes(x = explanatory, y = response)) +
  geom_point(color = "blue") +
  geom_smooth(method = "lm", se = FALSE, color = "blue")
plot1
# Overlay the second sample and its fitted line (red) on the first plot
plot1 +
  geom_point(data = sample2,
             aes(x = explanatory, y = response),
             color = "red") +
  geom_smooth(data = sample2,
              aes(x = explanatory, y = response),
              method = "lm",
              se = FALSE,
              color = "red")
# Sampling distribution of the slope: resample the population many times,
# fit a model per replicate, and look at the spread of the slope estimates.
# Repeatedly sample the population
manysamples <- infer::rep_sample_n(popdata, size=50, reps=100)
# Plot the regression lines
ggplot(manysamples, aes(x=explanatory, y=response, group=replicate)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE)
# Fit and tidy many linear models
# NOTE(review): dplyr::do() is superseded in current dplyr (group_modify() /
# reframe() are the modern equivalents) -- kept as-is to match the transcript.
manylms <- manysamples %>%
group_by(replicate) %>%
do(lm(response ~ explanatory, data=.) %>%
broom::tidy()) %>%
filter(term=="explanatory")
# Plot a histogram of the slope coefficients
ggplot(manylms, aes(x=estimate)) +
geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Compare sampling variability: 100 samples of n = 50 versus 100 samples of
# n = 10. The same aesthetic mapping is reused for both plots.
rep_aes <- aes(x = explanatory, y = response, group = replicate)

# 100 samples of size 50, with a regression line per sample
manysamples1 <- infer::rep_sample_n(popdata, size = 50, reps = 100)
ggplot(manysamples1, rep_aes) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)

# 100 samples of size 10, with a regression line per sample
manysamples2 <- infer::rep_sample_n(popdata, size = 10, reps = 100)
ggplot(manysamples2, rep_aes) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)
# In order to understand the sampling distribution of the slope coefficient,
# it helps to see how changing the population affects the slope's variability.
# Here the response is standardized and then rescaled to mean 40 and standard
# deviation 11.152, reducing its spread around the regression line.
oldPopData <- popdata
z_response <- (oldPopData$response - mean(oldPopData$response)) / sd(oldPopData$response)
popdata$response <- 40 + z_response * 11.152
# Take 100 samples of size 50 and plot each sample's regression line
manysamples <- infer::rep_sample_n(popdata, size = 50, reps = 100)
ggplot(manysamples, aes(x = explanatory, y = response, group = replicate)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)
Chapter 2 - Simulation Based Inference for Slope Parameters
Simulation-based inference - using the twins study from the 1920s (one twin was raised by their parents and the other raised in a foster home):
Simulation-based inference for slope - can also be calculated using bootstrap for CI (as opposed to testing a null-hypothesis):
Example code includes:
# Load the infer package
library(infer)
# Read the twins IQ data: 27 twin pairs with foster-reared IQ, biological-family
# IQ, and the biological family's social class
twins <- readr::read_csv("./RInputFiles/twins.csv")
## Parsed with column specification:
## cols(
## Foster = col_integer(),
## Biological = col_integer(),
## Social = col_character()
## )
str(twins)
## Classes 'tbl_df', 'tbl' and 'data.frame': 27 obs. of 3 variables:
## $ Foster : int 82 80 88 108 116 117 132 71 75 93 ...
## $ Biological: int 82 90 91 115 115 129 131 78 79 82 ...
## $ Social : chr "high" "high" "high" "high" ...
## - attr(*, "spec")=List of 2
## ..$ cols :List of 3
## .. ..$ Foster : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ Biological: list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ Social : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## ..$ default: list()
## .. ..- attr(*, "class")= chr "collector_guess" "collector"
## ..- attr(*, "class")= chr "col_spec"
# Calculate the observed slope of Foster IQ regressed on Biological IQ
obs_slope <- lm(Foster ~ Biological, data=twins) %>%
broom::tidy() %>%
filter(term == "Biological") %>%
pull(estimate)
# Simulate 10 slopes with a permuted dataset (permutation breaks any
# association between the two variables, mimicking the null hypothesis)
set.seed(4747)
perm_slope <- twins %>%
specify(Foster ~ Biological) %>%
hypothesize(null = "independence") %>%
generate(reps = 10, type = "permute") %>%
calculate(stat = "slope")
# Print the observed slope and the 10 permuted slopes
obs_slope
## [1] 0.901436
perm_slope
## # A tibble: 10 x 2
## replicate stat
## <int> <dbl>
## 1 1 0.143
## 2 2 0.0710
## 3 3 -0.456
## 4 4 0.0749
## 5 5 0.297
## 6 6 0.0673
## 7 7 0.140
## 8 8 0.164
## 9 9 0.0971
## 10 10 0.184
# Make a dataframe with replicates and plot them!
set.seed(4747)
perm_slope <- twins %>%
specify(Foster ~ Biological) %>%
hypothesize(null = "independence") %>%
generate(reps = 500, type = "permute") %>%
calculate(stat = "slope")
ggplot(perm_slope, aes(x=stat)) +
geom_density()
# Calculate the mean and the standard deviation of the slopes
# (the permuted slopes center near zero, as expected under the null)
mean(perm_slope$stat)
## [1] 0.006285095
sd(perm_slope$stat)
## [1] 0.1963073
# Calculate the absolute value of the slope
abs_obs_slope <- lm(Foster ~ Biological, data=twins) %>%
broom::tidy() %>%
filter(term == "Biological") %>%
pull(estimate) %>%
abs()
# Compute the p-value: the proportion of permuted |slopes| that exceed the
# observed |slope| (two-sided permutation test).
# NOTE(review): this uses a strict `>`; some texts use `>=` so the p-value can
# never be exactly zero -- confirm against the course convention.
perm_slope %>%
mutate(abs_perm_slope=abs(stat)) %>%
summarize(p_value = mean(abs_perm_slope > abs_obs_slope))
## # A tibble: 1 x 1
## p_value
## <dbl>
## 1 0
# Calculate 1000 bootstrapped slopes (resampling rows with replacement)
set.seed(4747)
BS_slope <- twins %>%
specify(Foster ~ Biological) %>%
generate(reps = 1000, type = "bootstrap") %>%
calculate(stat = "slope")
# Look at the head of BS_slope
head(BS_slope)
## # A tibble: 6 x 2
## replicate stat
## <int> <dbl>
## 1 1 0.946
## 2 2 0.966
## 3 3 0.870
## 4 4 0.930
## 5 5 0.807
## 6 6 0.900
# Create a confidence interval using the standard-error method
# (mean of bootstrap slopes +/- 2 bootstrap standard deviations)
BS_slope %>%
summarize(lower = mean(stat) - 2 *sd(stat),
upper = mean(stat) + 2 *sd(stat))
## # A tibble: 1 x 2
## lower upper
## <dbl> <dbl>
## 1 0.719 1.08
# Set alpha
alpha <- 0.05
# Create a confidence interval using the percentile method
# (middle 1 - alpha of the bootstrap distribution)
BS_slope %>%
summarize(low = quantile(stat, alpha/2),
high = quantile(stat, 1 - alpha/2))
## # A tibble: 1 x 2
## low high
## <dbl> <dbl>
## 1 0.724 1.08
Chapter 3 - t-Based Inference for the Slope Parameter
Mathematical approximation for testing and estimating slope parameters (based on the t-distribution):
Intervals in regression - estimating the coefficients by way of confidence intervals (CI):
Different types of intervals - often of interest to know the variability in the predicted value, not just the parameter estimates:
Example code includes:
# Commented-out course code kept for reference: compares the permutation
# distribution of the slope statistic to the t distribution.
# twins_perm <- twins %>%
# specify(Foster ~ Biological) %>%
# hypothesize(null="independence") %>%
# generate(reps = 10, type = "permute") %>%
# calculate(stat = "slope")
# The randomized slopes are given in the twins_perm dataframe
# Look at the head of the data
# head(twins_perm)
# Plot the histogram with the t distribution
# twins_perm %>%
# filter(term == "Biological_perm") %>%
# ggplot(aes(x=statistic)) +
# geom_histogram(aes(y = ..density..), bins = 50) +
# stat_function(fun = dt, color = "red", args = list(df=nrow(twins)-2))
# Tidy the model: coefficient table with t statistics and p-values
lm(Foster ~ Biological, data=twins) %>% broom::tidy()
## term estimate std.error statistic p.value
## 1 (Intercept) 9.207599 9.29989643 0.9900754 3.316237e-01
## 2 Biological 0.901436 0.09633286 9.3575128 1.203600e-09
# Create a one-sided p-value (half the two-sided p-value from the t test)
lm(Foster ~ Biological, data=twins) %>%
broom::tidy() %>%
filter(term == "Biological") %>%
select(p.value) %>%
mutate(p_value_1side = p.value/2)
## p.value p_value_1side
## 1 1.2036e-09 6.018e-10
# Test the new hypothesis H0: slope = 1 (rather than the default slope = 0)
lm(Foster ~ Biological, data = twins) %>%
broom::tidy() %>%
filter(term == "Biological") %>%
mutate(statistic_test1 = (estimate - 1) / std.error,
p_value_test1 = 2 * pt(abs(statistic_test1), df=nrow(twins)-2, lower.tail=FALSE))
## term estimate std.error statistic p.value statistic_test1
## 1 Biological 0.901436 0.09633286 9.357513 1.2036e-09 -1.023161
## p_value_test1
## 1 0.3160311
# Commented-out course code kept for reference: permutation p-value.
# Find the p-value
# perm_slope %>%
# mutate(abs_perm_slope = abs(stat)) %>%
# summarize(p_value = mean(abs_perm_slope > abs(obs_slope)))
# Set alpha
alpha <- 0.05
# Find the critical value: the t quantile with n - 2 degrees of freedom
crit_val <- qt(0.975, df = nrow(twins)-2)
# Tidy the model with the confidence level alpha
lm(Foster ~ Biological, data=twins) %>%
broom::tidy(conf.int=TRUE, conf.level=1-alpha)
## term estimate std.error statistic p.value conf.low
## 1 (Intercept) 9.207599 9.29989643 0.9900754 3.316237e-01 -9.9458964
## 2 Biological 0.901436 0.09633286 9.3575128 1.203600e-09 0.7030348
## conf.high
## 1 28.361094
## 2 1.099837
# Find the lower and upper bounds of the confidence interval by hand
# (estimate +/- critical value * standard error -- matches tidy() above)
lm(Foster ~ Biological, data=twins) %>%
broom::tidy() %>%
mutate(lower = estimate - crit_val * std.error,
upper = estimate + crit_val * std.error)
## term estimate std.error statistic p.value lower
## 1 (Intercept) 9.207599 9.29989643 0.9900754 3.316237e-01 -9.9458964
## 2 Biological 0.901436 0.09633286 9.3575128 1.203600e-09 0.7030348
## upper
## 1 28.361094
## 2 1.099837
# Create the bootstrap confidence interval (percentile method, for comparison)
BS_slope %>%
summarize(low = quantile(stat, alpha/2),
high = quantile(stat, 1 - alpha/2))
## # A tibble: 1 x 2
## low high
## <dbl> <dbl>
## 1 0.724 1.08
# Set alpha
alpha <- 0.05
# Find the critical value
crit_val <- qt(1-alpha/2, nrow(twins)-2)
# Create a dataframe of new observations
newtwins <- data.frame(Biological = c(80, 90, 100, 110))
# Find confidence intervals for the MEAN response at the new observations.
# These use only .se.fit; true prediction intervals, which also include the
# residual standard error sigma, are constructed further below.
lm(Foster ~ Biological, data=twins) %>%
broom::augment(newdata = newtwins) %>%
mutate(lowMean = .fitted - crit_val * .se.fit,
upMean = .fitted + crit_val * .se.fit)
## Biological .fitted .se.fit lowMean upMean
## 1 80 81.32248 2.093789 77.01024 85.63472
## 2 90 90.33684 1.572563 87.09809 93.57559
## 3 100 99.35120 1.554979 96.14866 102.55374
## 4 110 108.36556 2.054014 104.13524 112.59588
# Set alpha and find the critical value
alpha <- 0.05
crit_val <- qt(1-alpha/2, df=nrow(twins)-2)
# Find confidence intervals for the mean response at each observed point
predMeans <- lm(Foster ~ Biological, data=twins) %>%
broom::augment() %>%
mutate(lowMean = .fitted - crit_val*.se.fit,
upMean = .fitted + crit_val*.se.fit)
# Examine the intervals
head(predMeans)
## Foster Biological .fitted .se.fit .resid .hat .sigma
## 1 82 82 83.12535 1.962975 -1.125351 0.06449924 7.885059
## 2 80 90 90.33684 1.572563 -10.336839 0.04139435 7.588558
## 3 88 91 91.23827 1.543999 -3.238275 0.03990427 7.859737
## 4 108 115 112.87274 2.411531 -4.872739 0.09734432 7.818859
## 5 116 115 112.87274 2.411531 3.127261 0.09734432 7.859970
## 6 117 129 125.49284 3.571300 -8.492843 0.21349013 7.642607
## .cooksd .std.resid lowMean upMean
## 1 0.0007811552 -0.1505319 79.08253 87.16817
## 2 0.0402839133 -1.3659358 87.09809 93.57559
## 3 0.0037993804 -0.4275816 88.05835 94.41820
## 4 0.0237414774 -0.6635515 107.90610 117.83938
## 5 0.0097788932 0.4258588 107.90610 117.83938
## 6 0.2083390637 -1.2389779 118.13761 132.84807
# Plot the data with geom_ribbon() (hand-built mean-response band)
ggplot(predMeans, aes(x=Biological, y=Foster)) +
geom_point() +
stat_smooth(method="lm", se=FALSE) +
geom_ribbon(aes(ymin = lowMean, ymax = upMean), alpha=.2)
# Plot the data with stat_smooth() (ggplot's built-in confidence band)
ggplot(twins, aes(x = Biological, y = Foster)) +
geom_point() +
stat_smooth(method="lm", se=TRUE)
# Set alpha and find the critical value
alpha <- 0.05
crit_val <- qt(1-alpha/2, nrow(twins)-2)
# Fit a model and use glance to find sigma (the residual standard error)
twin_lm <- lm(Foster ~ Biological, data=twins)
twin_gl <- broom::glance(twin_lm)
# Pull sigma
twin_sig <- pull(twin_gl, sigma)
# Augment the model to find the prediction standard errors:
# se.pred^2 = sigma^2 + se.fit^2. Use `^`, R's documented exponentiation
# operator, rather than the undocumented `**` alias.
twin_pred <- broom::augment(twin_lm) %>%
mutate(.se.pred = sqrt(twin_sig^2 + .se.fit^2))
# Create prediction intervals (for a new individual response)
predResp <- twin_pred %>%
mutate(lowResp = .fitted - crit_val * .se.pred,
upResp = .fitted + crit_val * .se.pred)
# Plot the intervals using geom_ribbon(): the wider grey band is the
# prediction interval; the red band is the narrower mean-response interval
ggplot(predResp, aes(x=Biological, y=Foster)) +
geom_point() +
stat_smooth(method="lm", se=FALSE) +
geom_ribbon(aes(ymin = lowResp, ymax = upResp), alpha = .2) +
geom_ribbon(data = predMeans, aes(ymin = lowMean, ymax = upMean), alpha = .2, fill = "red")
Chapter 4 - Technical Conditions in Linear Regression
Technical conditions for linear regression:
Effect of an outlier - can have unintended impact on the inferential conclusions:
Moving forward when model assumptions are violated:
Example code includes:
# Hard-coded simulated values reconstructing the course's "well behaved"
# example dataset (hypdata_nice).
# A dataset containing well behaved observations has been preloaded and is called hypdata_nice. There are two variables in the dataset which are aptly named explanatory and response.
expl <- c(2.14, 3.04, 2.48, 2.91, 3.12, 2.45, 3.62, 3.7, 3.11, 4.18, 3.07, 3, 3.52, 3.3, 3.58, 1.35, 2.91, 3.66, 3.97, 3.23, 3.61, 3.45, 3.36, 2.95, 3.86, 4.38, 1.73, 3.1, 3.68, 3.62, 3.42, 2.61, 3.74, 3.1, 2.07, 3.85, 2.76, 4.03, 0.91, 4.29, 2.44, 3.14, 3.59, 2.03, 1.58, 2.2, 4.65, 2.1, 1.96, 2.57, 3.46, 3.07, 2.49, 3.64, 1.4, 2.64, 1.77, 3.29, 2.96, 2.82, 2.6, 3.23, 5.24, 3.89, 3.18, 2.94, 1.14, 3.04, 1.62, 3.09, 1.18, 2.76, 5.26, 2.34, 3.71, 3.26, 4.15, 2.45, 1.06, 2.11, 4.14, 3.22, 1.34, 1.58, 2.93, 2.14, 3.83, 3.97, 2.94, 2.24, 4.01, 5.46, 3.96, 3.14, 0.79, 3.75, 1.67, 2.95, 4.02, 2.9, 3.14, 3.89, 3.42, 3.36, 3.6, 2.03, 2.22, 3.82, 3.01, 1.51, 2.52, 3.23, 2.63, 5.22, 4.25, 3.26, 2.09, 4.08, 3.62, 2.08, 4.71, 2.16, 2.82, 3.16, 2.3, 1.77, 3.72, 3.91, 2.03, 0.3, 3.32, 2.79, 3.27, 2.71, 4.11, 2.15, 1.61, 4.23, 2.82, 4.99, 2.18, 3.02, 3.5, 1.32, 2.52, 1.61, 4.58, 1.99, 2.95, 2.87, 4.68, 3.84, 2.03, 3.2, 1.89, 2.15, 2.55, 3.44, 2.99, 2.47, 2.77, 3.66, 1.9, 3.77, 3.42, 3.96, 2.78, 0.81, -0.34, 1.28, 3.76, 4.19, 3.07, 3.03, 1.28, 2.31, 1.97, 1.67, 2.9, 2.6, 3.71, 3.94, 2.88, 4.74, 3.18, 3.14, 3.72, 2.38, 2.4, 4.16, 3.77, 3.16, 2.45, 1.75, 2.39, 3.26, 3.32, 3.35, 3.93, 1.55)
resp <- c(19.04, 21.44, 19.21, 20.63, 21.66, 15.99, 26.76, 28.85, 20.9, 19.16, 20.53, 22.79, 22.55, 21.72, 26.31, 18.15, 18.3, 28.03, 24.12, 20.47, 23.03, 22.06, 21.91, 20.4, 22.89, 28.62, 16.63, 22.41, 23.45, 20.06, 25.29, 15.54, 22.88, 20.3, 18.87, 26.13, 19.63, 23.3, 12.38, 27.77, 18.67, 23.12, 19.87, 20.51, 17.92, 20.25, 29.82, 13.36, 18.36, 21.2, 23.69, 17.57, 18.19, 23.12, 15.58, 18.53, 20, 21.8, 19.32, 18.8, 20.24, 24.26, 27.59, 26.02, 20.1, 17.24, 12.87, 20.05, 16.27, 19.7, 20.37, 22.49, 24.61, 23.41, 20.51, 20.4, 24, 23.17, 19.66, 17.06, 23.15, 18.48, 16.99, 14.89, 23.4, 19.94, 20.93, 24.21, 19.86, 17.71, 22.74, 25.86, 25.75, 21.4, 16.65, 25.84, 18.49, 18.95, 26, 19.74, 19.39, 25.16, 22.94, 22.01, 26.12, 19.21, 18.62, 25.96, 19.53, 15.48, 22.79, 19.91, 21.65, 29.06, 22.07, 19.59, 21.01, 26.21, 23.75, 18.61, 26.75, 17.06, 18.28, 21.06, 17.18, 17.07, 22.46, 21.55, 14.53, 10.98, 19.34, 21.72, 23.16, 21.76, 23.11, 16.42, 19.64, 24.33, 19.44, 26.39, 22.03, 19.44, 20.14, 17.29, 21.89, 17.28, 25.25, 17.96, 20.94, 21, 27.3, 20.64, 23.03, 17.6, 13.78, 19.08, 16.24, 27.76, 21.19, 21.96, 21.25, 20.94, 20.3, 22.67, 21.8, 21.34, 23.33, 15.82, 12.92, 16.54, 23.08, 24.23, 22.24, 19.79, 15.31, 20.81, 17.52, 20.92, 14.01, 18.63, 27.18, 21.96, 22.9, 26.57, 22.39, 22.09, 24.67, 20.51, 18.77, 23.92, 19.28, 21.14, 21.12, 15.82, 18.52, 25.09, 25.27, 22.47, 26.35, 16.52)
hypdata_nice <- data.frame(response=resp, explanatory=expl)
str(hypdata_nice)
## 'data.frame': 200 obs. of 2 variables:
## $ response : num 19 21.4 19.2 20.6 21.7 ...
## $ explanatory: num 2.14 3.04 2.48 2.91 3.12 2.45 3.62 3.7 3.11 4.18 ...
# Plot the data
ggplot(hypdata_nice, aes(x=explanatory, y=response)) +
geom_point()
# Create and augmented model
nice_lm <- lm(response ~ explanatory, data=hypdata_nice) %>%
broom::augment()
# Print the head of nice_lm
head(nice_lm)
## response explanatory .fitted .se.fit .resid .hat .sigma
## 1 19.04 2.14 18.69469 0.2035616 0.3453107 0.008481517 2.215806
## 2 21.44 3.04 21.23226 0.1568479 0.2077393 0.005035457 2.215894
## 3 19.21 2.48 19.65333 0.1738709 -0.4433274 0.006187790 2.215717
## 4 20.63 2.91 20.86572 0.1564783 -0.2357226 0.005011754 2.215879
## 5 21.66 3.12 21.45782 0.1584301 0.2021774 0.005137562 2.215896
## 6 15.99 2.45 19.56874 0.1760204 -3.5787417 0.006341730 2.201131
## .cooksd .std.resid
## 1 1.052794e-04 0.15689185
## 2 2.246532e-05 0.09422272
## 3 1.260166e-04 -0.20119317
## 4 2.878777e-05 -0.10691363
## 5 2.171440e-05 0.09170476
## 6 8.418718e-03 -1.62424910
# Residuals versus fitted values for the well-behaved fit
resid_plot <- ggplot(nice_lm, aes(x = .fitted, y = .resid))
resid_plot + geom_point()
# A dataset containing poorly behaved observations has been preloaded and is called hypdata_poor. There are two variables in the dataset which are aptly named explanatory and response
resp <- c(19.08, 21.62, 19.15, 20.57, 21.84, 15.23, 29.92, 33.73, 20.66, 13.32, 20.16, 23.68, 22.54, 21.6, 29.13, 17.46, 17.2, 32.23, 24.34, 19.71, 23.2, 21.85, 21.8, 20.19, 22.25, 32.75, 16.7, 23.01, 23.8, 17.81, 27.43, 14.22, 22.57, 19.74, 18.89, 28.52, 19.39, 22.5, 13.66, 31.09, 18.52, 24.1, 17.58, 20.55, 17.67, 20.41, 34.94, 13.12, 18.35, 21.63, 24.64, 15.61, 17.88, 23.29, 15.76, 18.09, 19.68, 21.75, 18.56, 18.12, 20.38, 25.84, 27.38, 28.25, 19.27, 15.54, 13.97, 19.5, 16.38, 18.85, 18.39, 23.33, 19.43, 24.15, 18.26, 19.53, 23.52, 24.03, 17.56, 16.99, 21.78, 16.54, 16.67, 15.28, 24.62, 20.05, 18.58, 24.52, 19.41, 17.59, 21.46, 21.5, 27.59, 21.39, 15.27, 28.11, 18.25, 18.05, 27.96, 19.3, 18.25, 26.58, 23.42, 21.95, 28.78, 19.23, 18.62, 28.23, 18.78, 15.74, 23.63, 18.82, 22.2, 31.28, 19.05, 18.21, 21.14, 28.26, 24.47, 18.63, 27.6, 16.95, 17.39, 20.82, 16.92, 17.1, 21.87, 19.48, 14.47, 12.61, 17.62, 22.25, 24.02, 22.34, 21.83, 16.26, 19.09, 23.91, 19.02, 25.52, 22.33, 18.63, 18.39, 16.83, 22.5, 17.19, 24.61, 17.96, 20.98, 21.17, 29, 18.01, 23.1, 15.21, 13.99, 19.13, 15.29, 31.68, 21.3, 22.55, 21.61, 19.24, 20.18, 22.09, 21.47, 18.84, 24.5, 14.98, 10.67, 16.3, 22.91, 23.86, 22.79, 19.12, 15.5, 21.1, 17.53, 20.27, 10.99, 18.28, 30.65, 20.17, 23.9, 27.05, 22.89, 22.47, 25.97, 20.77, 18.68, 23.3, 15.71, 20.94, 21.52, 16, 18.39, 27.17, 27.45, 22.74, 28.83, 16.55)
expl <- c(2.14, 3.04, 2.48, 2.91, 3.12, 2.45, 3.62, 3.7, 3.11, 4.18, 3.07, 3, 3.52, 3.3, 3.58, 1.35, 2.91, 3.66, 3.97, 3.23, 3.61, 3.45, 3.36, 2.95, 3.86, 4.38, 1.73, 3.1, 3.68, 3.62, 3.42, 2.61, 3.74, 3.1, 2.07, 3.85, 2.76, 4.03, 0.91, 4.29, 2.44, 3.14, 3.59, 2.03, 1.58, 2.2, 4.65, 2.1, 1.96, 2.57, 3.46, 3.07, 2.49, 3.64, 1.4, 2.64, 1.77, 3.29, 2.96, 2.82, 2.6, 3.23, 5.24, 3.89, 3.18, 2.94, 1.14, 3.04, 1.62, 3.09, 1.18, 2.76, 5.26, 2.34, 3.71, 3.26, 4.15, 2.45, 1.06, 2.11, 4.14, 3.22, 1.34, 1.58, 2.93, 2.14, 3.83, 3.97, 2.94, 2.24, 4.01, 5.46, 3.96, 3.14, 0.79, 3.75, 1.67, 2.95, 4.02, 2.9, 3.14, 3.89, 3.42, 3.36, 3.6, 2.03, 2.22, 3.82, 3.01, 1.51, 2.52, 3.23, 2.63, 5.22, 4.25, 3.26, 2.09, 4.08, 3.62, 2.08, 4.71, 2.16, 2.82, 3.16, 2.3, 1.77, 3.72, 3.91, 2.03, 0.3, 3.32, 2.79, 3.27, 2.71, 4.11, 2.15, 1.61, 4.23, 2.82, 4.99, 2.18, 3.02, 3.5, 1.32, 2.52, 1.61, 4.58, 1.99, 2.95, 2.87, 4.68, 3.84, 2.03, 3.2, 1.89, 2.15, 2.55, 3.44, 2.99, 2.47, 2.77, 3.66, 1.9, 3.77, 3.42, 3.96, 2.78, 0.81, -0.34, 1.28, 3.76, 4.19, 3.07, 3.03, 1.28, 2.31, 1.97, 1.67, 2.9, 2.6, 3.71, 3.94, 2.88, 4.74, 3.18, 3.14, 3.72, 2.38, 2.4, 4.16, 3.77, 3.16, 2.45, 1.75, 2.39, 3.26, 3.32, 3.35, 3.93, 1.55)
# Combine the vectors above into the poorly-behaved dataset and inspect it
hypdata_poor <- data.frame(response=resp, explanatory=expl)
str(hypdata_poor)
## 'data.frame': 200 obs. of 2 variables:
## $ response : num 19.1 21.6 19.1 20.6 21.8 ...
## $ explanatory: num 2.14 3.04 2.48 2.91 3.12 2.45 3.62 3.7 3.11 4.18 ...
# Scatterplot of the poorly behaved data
p_poor <- ggplot(hypdata_poor, aes(x = explanatory, y = response))
p_poor + geom_point()
# Fit the linear model, then attach per-observation diagnostics with
# broom::augment()
fit_poor <- lm(response ~ explanatory, data = hypdata_poor)
poor_lm <- broom::augment(fit_poor)
# Residuals plotted against the fitted values
ggplot(poor_lm, aes(x = .fitted, y = .resid)) +
  geom_point()
# The data provided in this exercise (hypdata_out) has an extreme outlier. You will run the linear model with and without the outlying point to see how one observation can affect the estimate of the line.
expl <- c(2.14, 3.04, 2.48, 2.91, 3.12, 2.45, 3.62, 3.7, 3.11, 4.18, 3.07, 3, 3.52, 3.3, 3.58, 1.35, 2.91, 3.66, 3.97, 3.23, 3.61, 3.45, 3.36, 2.95, 3.86, 4.38, 1.73, 3.1, 3.68, 3.62, 3.42, 2.61, 3.74, 3.1, 2.07, 3.85, 2.76, 4.03, 0.91, 4.29, 2.44, 3.14, 3.59, 2.03, 1.58, 2.2, 4.65, 2.1, 1.96, 2.57)
resp <- c(23.06, 21.85, 14.4, 27.13, 5.31, 15.71, 10.51, 26, 20.96, 22.71, 17.17, 23.26, 44.96, 30.77, 24.49, 15.47, 2.14, 23.32, 10.12, 22.58, 4.63, 19.91, 44.7, 14.24, 30.7, 27.72, 28.72, 15.84, 3.65, 13.99, 33.68, 22.02, 6.66, 7.07, 17.56, 14.94, 28.6, 33.76, 14.16, 17.31, 29.39, 46.02, 32.34, 19.49, -5.37, 26.08, 500, 17.81, 28.03, 18.73)
# Build the dataset containing one extreme outlier (response = 500 at
# explanatory = 4.65) and inspect it
hypdata_out <- data.frame(response=resp, explanatory=expl)
str(hypdata_out)
## 'data.frame': 50 obs. of 2 variables:
## $ response : num 23.06 21.85 14.4 27.13 5.31 ...
## $ explanatory: num 2.14 3.04 2.48 2.91 3.12 2.45 3.62 3.7 3.11 4.18 ...
# Scatterplot with an OLS line fit to all points (outlier included)
base_plot <- ggplot(hypdata_out, aes(x = explanatory, y = response)) +
  geom_point()
base_plot +
  stat_smooth(method = "lm", se = FALSE)
# Drop the extreme observation (the point with explanatory > 4.6)
hypdata_noout <- filter(hypdata_out, explanatory < 4.6)
# Overlay the no-outlier fit (red) on top of the full-data fit
base_plot +
  stat_smooth(method = "lm", se = FALSE) +
  stat_smooth(data = hypdata_noout, method = "lm", se = FALSE, color = "red")
# Examine the tidy model (outlier included): broom::tidy() returns the
# coefficient table (estimate, std.error, statistic, p.value) as a data frame
lm(response ~ explanatory, data=hypdata_out) %>%
broom::tidy()
## term estimate std.error statistic p.value
## 1 (Intercept) -46.27359 36.60763 -1.264042 0.21231907
## 2 explanatory 24.99987 11.55650 2.163274 0.03552965
# Examine the new tidy model (outlier removed); compare the slope estimate
# and p-value with the table above — one point changed both dramatically
lm(response ~ explanatory, data=hypdata_noout) %>%
broom::tidy()
## term estimate std.error statistic p.value
## 1 (Intercept) 14.769176 6.185344 2.3877698 0.02102301
## 2 explanatory 1.957367 1.976048 0.9905467 0.32697934
# The data frames perm_slope_out and perm_slope_noout are also in your workspace. These data frames hold the permuted slopes for each of the original datasets
# Finally, the observed values are stored in the variables obs_slope_out and obs_slope_noout.
# (Kept commented out: the permuted-slope objects are created on the DataCamp
# server and are not built anywhere in this document.)
# Calculate the p-value with the outlier
# perm_slope_out %>%
# mutate(abs_perm_slope = abs(stat)) %>%
# summarize(p_value = mean(abs_perm_slope > abs(obs_slope_out)))
# Calculate the p-value without the outlier
# perm_slope_noout %>%
# mutate(abs_perm_slope = abs(stat)) %>%
# summarize(p_value = mean(abs_perm_slope > abs(obs_slope_noout)))
# The dataset data_nonlin has been preloaded.
expl <- c(0.19, 0.34, 0.51, 0.45, 0.3, 0.92, 0.47, 0.93, 0.55, 0.93, 0.29, 0, 0.73, 0.69, 0.76, 0.86, 0.54, 0.95, 0.88, 0.64, 0.53, 0.39, 0.5, 0.53, 0.7, 0.11, 0.62, 0.69, 0.72, 0.27, 0.05, 0.45, 0.46, 0.51, 0.74, 0.86, 0.83, 0.17, 0.59, 0.45, 0.73, 0.31, 0.67, 0.83, 0.64, 0.35, 0.48, 0.2, 0.8, 0.83, 0.92, 0.34, 0.1, 0.91, 0.54, 0.08, 0.75, 0.45, 0.73, 0.11, 0.66, 0.31, 0.35, 0.18, 0.77, 0.96, 0.54, 0.59, 0.18, 0.15, 0.8, 0.21, 0.4, 0.27, 0.85, 0.64, 0.02, 0.84, 0.9, 0.42, 0.29, 0.83, 0.56, 0.78, 0.72, 0.51, 0.17, 0.18, 0.08, 0.71, 0.21, 0.97, 0.95, 0.64, 0.18, 0.55, 0.15, 0.72, 0.33, 0.73)
resp <- c(11.62, 12.81, 15.01, 15.05, 10.67, 25.18, 13.41, 25.96, 16.09, 25.48, 11.75, 10.24, 22.46, 20.02, 21.09, 23.65, 14.23, 26.45, 22.89, 18.12, 13.96, 13.25, 17.5, 15.18, 20.11, 10.79, 18.75, 18.65, 17.96, 10.99, 11.33, 14.58, 12.96, 14.1, 20.47, 22.88, 23.77, 11.89, 16.97, 13.6, 21.14, 14.87, 19.76, 23.05, 15.82, 13.64, 13.56, 11.11, 23.14, 22.78, 25.49, 13.64, 10.88, 25.64, 16.58, 9.34, 19.95, 15.15, 20.22, 9.04, 18.09, 12.59, 12.51, 13.24, 22.49, 26.83, 15.11, 18.06, 11.61, 9.84, 23.75, 10.41, 13.49, 12.02, 22.65, 16.75, 10.78, 24.02, 23.93, 11.28, 12.44, 22.55, 16.57, 21.24, 21.07, 14.67, 9.53, 12.24, 10.15, 21.68, 10.5, 27.13, 26.91, 16.33, 10.58, 14.71, 12.36, 18.83, 12.6, 20)
# Assemble the non-linear example data and inspect it
data_nonlin <- data.frame(response=resp, explanatory=expl)
str(data_nonlin)
## 'data.frame': 100 obs. of 2 variables:
## $ response : num 11.6 12.8 15 15.1 10.7 ...
## $ explanatory: num 0.19 0.34 0.51 0.45 0.3 0.92 0.47 0.93 0.55 0.93 ...
# Create an augmented model using the non-linear data: broom::augment()
# attaches per-observation diagnostics (.fitted, .resid, ...) to the data
lm_nonlin <- lm(response ~ explanatory, data=data_nonlin) %>%
  broom::augment()
# Plot the residuals against the fitted values; a systematic pattern here
# indicates the straight-line model misses the shape of the relationship.
# geom_hline(yintercept = 0) is the direct way to draw the y = 0 reference
# line (geom_abline(slope = 0, intercept = 0) was a roundabout equivalent).
ggplot(lm_nonlin, aes(x=.fitted, y=.resid)) +
  geom_point() +
  geom_hline(yintercept = 0)
# Create a second augmented model adding a quadratic term; I() shields ^
# from being interpreted as formula syntax
lm2_nonlin <- lm(response ~ explanatory + I(explanatory^2), data=data_nonlin) %>%
  broom::augment()
# Plot the second set of residuals; they should now scatter more evenly
# around the zero line if the quadratic captures the curvature
ggplot(lm2_nonlin, aes(x=.fitted, y=.resid)) +
  geom_point() +
  geom_hline(yintercept = 0)
# In this next example, it appears as though the variance of the response variable increases as the explanatory variable increases
# Note that the fix in this exercise has the effect of changing both the variability as well as modifying the linearity of the relationship
# The dataset data_nonequalvar has been preloaded
expl <- c(48.9, 78.2, 39.5, 42.9, 79.9, 57.9, 35.1, 50.7, 62.6, 63.3, 38.1, 75.8, 43.6, 48.8, 78, 44.8, 29.2, 57.7, 60.6, 63.3, 47.3, 54.3, 67.3, 54.8, 52.9, 50.6, 78, 62.2, 45.8, 40.5, 52.1, 58.7, 31.1, 61.8, 52.6, 63.7, 37.8, 42.5, 29.5, 51.3, 57.2, 70.3, 46.1, 91.2, 74.9, 47.3, 34.8, 29.3, 42, 57.9, 48.1, 35, 56.7, 71.9, 47.9, 49.9, 36.3, 53.6, 50.3, 60, 53.6, 67.1, 55.6, 44.2, 56.4, 41.5, 30.4, 84.1, 48.8, 59.6, 50.8, 59.3, 94.6, 70.5, 45.4, 44.8, 36.2, 87.6, 68.9, 55.8, 64.9, 33.2, 89.9, 37.9, 54.7, 64.6, 71.6, 65.8, 48.3, 67.5, 62.1, 63.6, 67.8, 54.2, 55.6, 65.4, 55.4, 50.2, 81.3, 57.9, 62.1, 55.3, 75.5, 65, 65.6, 53.1, 71.3, 53.1, 63.3, 45.3, 61, 54, 44.8, 66.5, 55.2, 67.8, 43.2, 46.9, 57.1, 92.2, 70.1, 49.7, 46.2, 67, 29.8, 40.8, 62.6, 60.4, 86.4, 42.2, 42.9, 69.5, 63.1, 46.2, 38.5, 43.7, 53.3, 60.3, 32.6, 72.5)
resp <- c(127.15, 45.06, 15.54, 26.25, 17.78, 47.81, 14.22, 104.44, 134.4, 3.21, 138.4, 59.52, 26.54, 4.43, 65.15, 21.73, 8.6, 132.58, 84.87, 242.71, 23.36, 21.16, 29.59, 100.78, 135.44, 21.39, 90.71, 12.22, 34.61, 104.3, 102.54, 9.25, 32.13, 17.37, 22.74, 20.3, 99.88, 33.7, 26.17, 9.67, 2.23, 173.31, 46.49, 339.71, 110.22, 82.22, 4.93, 6.09, 12.88, 37.66, 59.45, 5.12, 37.84, 67.36, 30.94, 30.22, 12.6, 14.14, 106.09, 52.13, 4.72, 35.19, 7.49, 35.67, 28.08, 56.13, 66.75, 69.87, 65.66, 9.08, 89.92, 20.81, 43.22, 59.37, 21.8, 34.34, 1.65, 92.08, 36.89, 63.7, 23.8, 15.55, 79.21, 35.77, 74.66, 55.85, 58.33, 41.08, 53.43, 47.58, 46.57, 23.1, 305.41, 51.99, 39.4, 49.44, 116.64, 110, 120.17, 41.52, 60.48, 26.31, 121.42, 111.76, 33.76, 43.43, 150.36, 31.19, 30.25, 74.32, 132.18, 34.32, 20.45, 106.13, 47.9, 110.07, 66.47, 19.96, 42.72, 361.12, 281.52, 139.26, 22.22, 26.9, 7.66, 3.78, 52.8, 47.61, 81.24, 80.17, 19.48, 7.72, 43.5, 51.48, 37.18, 15.6, 36.02, 6.85, 15.42, 214.95)
# Assemble the unequal-variance example data and inspect it
data_nonequalvar <- data.frame(response=resp, explanatory=expl)
str(data_nonequalvar)
## 'data.frame': 140 obs. of 2 variables:
## $ response : num 127.2 45.1 15.5 26.2 17.8 ...
## $ explanatory: num 48.9 78.2 39.5 42.9 79.9 57.9 35.1 50.7 62.6 63.3 ...
# Create an augmented model with per-observation diagnostics
lm_nonequalvar <- lm(response ~ explanatory, data=data_nonequalvar) %>%
  broom::augment()
# Plot the residuals; a fan shape (spread growing with the fitted values)
# signals non-constant variance. geom_hline(yintercept = 0) draws the y = 0
# reference line directly, replacing geom_abline(slope = 0, intercept = 0).
ggplot(lm_nonequalvar, aes(x=.fitted, y=.resid)) +
  geom_point() +
  geom_hline(yintercept = 0)
# Create an augmented model using the log of the response, a standard
# variance-stabilizing transformation
lm2_nonequalvar <- lm(log(response) ~ explanatory, data=data_nonequalvar) %>%
  broom::augment()
# Plot the log of the response against the explanatory variable
ggplot(data_nonequalvar, aes(x=explanatory, y=log(response))) +
  geom_point() +
  stat_smooth(method="lm", se=FALSE)
# Plot the second set of residuals; the spread should now be roughly constant
ggplot(lm2_nonequalvar, aes(x=.fitted, y=.resid)) +
  geom_point() +
  geom_hline(yintercept = 0)
# In this last example, it appears as though the points are not normally distributed around the regression line
# Again, note that the fix in this exercise has the effect of changing both the variability as well as modifying the linearity of the relationship
# The dataset data_nonnorm has been preloaded
resp <- c(190.58, 187.28, 172.34, 291.5, 43.66, 315.81, 94.42, 417.19, 234.56, 343.66, 127.73, 119.66, 690.5, 416.69, 334.43, 337.93, 64.21, 386.13, 176.61, 280.16, 64.13, 167.81, 578.18, 160.62, 393.2, 147.78, 432.51, 216.36, 88.97, 83.47, 226.85, 235.78, 64.11, 89.33, 295.13, 230.52, 469.54, 241.46, 246.09, 131.25, 453.87, 527.73, 422.67, 356.65, 56.87, 272, 89.89, 138.36, 488.88, 321.64, 388.48, 287.1, 161.48, 423.7, 315.97, 47.3, 207.29, 314.94, 300.95, 26.88, 215.97, 196.3, 144.08, 428.02, 516.09, 423.95, 138.73, 408.28, 202.76, 60.61, 617.6, 75.43, 177.11, 176.66, 246.48, 131.29, 170.23, 485.36, 229.54, 11.85, 200.46, 304.64, 276.42, 277.58, 468.18, 138.25, 37.26, 279.1, 101.36, 628.17, 78.01, 391.71, 462.21, 92.89, 98.36, 97.8, 317.43, 172.27, 172.11, 281.36)
expl <- c(0.19, 0.34, 0.51, 0.45, 0.3, 0.92, 0.47, 0.93, 0.55, 0.93, 0.29, 0, 0.73, 0.69, 0.76, 0.86, 0.54, 0.95, 0.88, 0.64, 0.53, 0.39, 0.5, 0.53, 0.7, 0.11, 0.62, 0.69, 0.72, 0.27, 0.05, 0.45, 0.46, 0.51, 0.74, 0.86, 0.83, 0.17, 0.59, 0.45, 0.73, 0.31, 0.67, 0.83, 0.64, 0.35, 0.48, 0.2, 0.8, 0.83, 0.92, 0.34, 0.1, 0.91, 0.54, 0.08, 0.75, 0.45, 0.73, 0.11, 0.66, 0.31, 0.35, 0.18, 0.77, 0.96, 0.54, 0.59, 0.18, 0.15, 0.8, 0.21, 0.4, 0.27, 0.85, 0.64, 0.02, 0.84, 0.9, 0.42, 0.29, 0.83, 0.56, 0.78, 0.72, 0.51, 0.17, 0.18, 0.08, 0.71, 0.21, 0.97, 0.95, 0.64, 0.18, 0.55, 0.15, 0.72, 0.33, 0.73)
# Assemble the non-normal-residuals example data and inspect it
data_nonnorm <- data.frame(response=resp, explanatory=expl)
str(data_nonnorm)
## 'data.frame': 100 obs. of 2 variables:
## $ response : num 190.6 187.3 172.3 291.5 43.7 ...
## $ explanatory: num 0.19 0.34 0.51 0.45 0.3 0.92 0.47 0.93 0.55 0.93 ...
# Create an augmented model of the data
lm_nonnorm <- lm(response ~ explanatory, data=data_nonnorm) %>%
  broom::augment()
# Plot the residuals around the y = 0 reference line; geom_hline(yintercept
# = 0) expresses the horizontal reference line directly, replacing the
# roundabout geom_abline(slope = 0, intercept = 0)
ggplot(lm_nonnorm, aes(x=.fitted, y=.resid)) +
  geom_point() +
  geom_hline(yintercept = 0)
# Create the second augmented model on the square root of the response — a
# common fix when residuals are skewed rather than normally distributed
lm2_nonnorm <- lm(sqrt(response) ~ explanatory, data=data_nonnorm) %>%
  broom::augment()
# Plot the square root of the response with its fitted line
ggplot(data_nonnorm, aes(x=explanatory, y=sqrt(response))) +
  geom_point() +
  stat_smooth(method="lm", se=FALSE)
# Plot the second set of residuals
ggplot(lm2_nonnorm, aes(x=.fitted, y=.resid)) +
  geom_point() +
  geom_hline(yintercept = 0)
Chapter 5 - Building on Inference in Simple Regression
Inference on transformed variables - interpretation of the coefficients is no longer just slope for y ~ x:
Multicollinearity - the situation in which two or more of the predictor variables are correlated:
Multiple linear regression:
Summary:
Example code includes:
# Read the LA homes dataset; readr echoes its guessed column types below
LAhomes <- readr::read_csv("./RInputFiles/LAhomes.csv")
## Parsed with column specification:
## cols(
## city = col_character(),
## type = col_character(),
## bed = col_integer(),
## bath = col_double(),
## garage = col_character(),
## sqft = col_integer(),
## pool = col_character(),
## spa = col_character(),
## price = col_double()
## )
# give.attr=FALSE suppresses the tibble attributes in the str() display
str(LAhomes, give.attr=FALSE)
## Classes 'tbl_df', 'tbl' and 'data.frame': 1594 obs. of 9 variables:
## $ city : chr "Long Beach" "Long Beach" "Long Beach" "Long Beach" ...
## $ type : chr NA NA NA NA ...
## $ bed : int 0 0 0 0 0 1 1 1 1 1 ...
## $ bath : num 1 1 1 1 1 1 1 1 1 1 ...
## $ garage: chr NA NA NA "1" ...
## $ sqft : int 513 550 550 1030 1526 552 558 596 744 750 ...
## $ pool : chr NA NA NA NA ...
## $ spa : chr NA NA NA NA ...
## $ price : num 119000 153000 205000 300000 375000 ...
# Read the NYC Italian restaurants dataset (ratings are Zagat-style scores)
restNYC <- readr::read_csv("./RInputFiles/restNYC.csv")
## Parsed with column specification:
## cols(
## Case = col_integer(),
## Restaurant = col_character(),
## Price = col_integer(),
## Food = col_integer(),
## Decor = col_integer(),
## Service = col_integer(),
## East = col_integer()
## )
# give.attr=FALSE suppresses the tibble attributes in the str() display
str(restNYC, give.attr=FALSE)
## Classes 'tbl_df', 'tbl' and 'data.frame': 168 obs. of 7 variables:
## $ Case : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Restaurant: chr "Daniella Ristorante" "Tello's Ristorante" "Biricchino" "Bottino" ...
## $ Price : int 43 32 34 41 54 52 34 34 39 44 ...
## $ Food : int 22 20 21 20 24 22 22 20 22 21 ...
## $ Decor : int 18 19 13 20 19 22 16 18 19 17 ...
## $ Service : int 20 19 18 17 21 21 21 21 22 19 ...
## $ East : int 0 0 0 0 0 0 0 1 1 1 ...
# Using tidy output, run an lm analysis on price versus sqft for the LAhomes dataset.
# Run one more analysis, but this time on transformed variables: log(price) versus log(sqft).
# Create a tidy model; broom::tidy() returns the coefficient table as a data frame
lm(price ~ sqft, data=LAhomes) %>%
broom::tidy()
## term estimate std.error statistic p.value
## 1 (Intercept) -1661892.391 64459.91198 -25.78180 8.851557e-123
## 2 sqft 1485.995 22.70924 65.43569 0.000000e+00
# Create a tidy model using the log of both variables; with a log-log model
# the log(sqft) coefficient is no longer a simple slope — it is interpreted
# on the multiplicative (elasticity-like) scale
lm(log(price) ~ log(sqft), data=LAhomes) %>%
broom::tidy()
## term estimate std.error statistic p.value
## 1 (Intercept) 2.702788 0.14369289 18.80948 1.972382e-71
## 2 log(sqft) 1.441583 0.01953529 73.79375 0.000000e+00
# Output the tidy model with log(bath) added; per the output below its
# coefficient is not significant once log(sqft) is in the model — consistent
# with bath count being correlated with sqft (multicollinearity)
lm(log(price) ~ log(sqft) + log(bath), data=LAhomes) %>%
broom::tidy()
## term estimate std.error statistic p.value
## 1 (Intercept) 2.51405101 0.26186485 9.6005668 2.957034e-21
## 2 log(sqft) 1.47120722 0.03952669 37.2206050 1.194181e-218
## 3 log(bath) -0.03904504 0.04528729 -0.8621632 3.887276e-01
# Using the NYC Italian restaurants dataset (compiled by Simon Sheather in A Modern Approach to Regression with R), restNYC,
# you will investigate the effect on the significance of the coefficients when there are multiple variables in the model
# Recall, the p-value associated with any coefficient is the probability of the observed data given that the particular variable is independent of the response AND given that all other variables are included in the model.
# Output the first model: Price regressed on Service alone
lm(Price ~ Service, data=restNYC) %>%
broom::tidy()
## term estimate std.error statistic p.value
## 1 (Intercept) -11.977811 5.1092741 -2.344327 2.024510e-02
## 2 Service 2.818433 0.2618399 10.763954 7.879529e-21
# Output the second model: with Food and Decor added, the Service coefficient
# shrinks and loses significance (p = 0.73 in the output below)
lm(Price ~ Service + Food + Decor, data=restNYC) %>%
broom::tidy()
## term estimate std.error statistic p.value
## 1 (Intercept) -24.6408955 4.7536113 -5.1836160 6.332777e-07
## 2 Service 0.1350457 0.3956525 0.3413239 7.332967e-01
## 3 Food 1.5555712 0.3730821 4.1695147 4.932501e-05
## 4 Decor 1.8473352 0.2175539 8.4913900 1.170666e-14
Chapter 1 - Parallel Slopes
What if you have two groups?
Visualizing parallel slopes models:
Interpreting parallel slopes coefficients:
Three ways to describe a model - Mathematical, Geometric, Syntactic:
Example code includes:
# In this case, we want to understand how the price of MarioKart games sold at auction varies as a function of not only the number of wheels included in the package, but also whether the item is new or used
# A parallel slopes model has the form y ~ x + z, where z is a categorical explanatory variable, and x is a numerical explanatory variable
# Explore the data: load marioKart from openintro without attaching the package
data(marioKart, package="openintro")
glimpse(marioKart)
## Observations: 143
## Variables: 12
## $ ID <dbl> 150377422259, 260483376854, 320432342985, 280405224...
## $ duration <int> 3, 7, 3, 3, 1, 3, 1, 1, 3, 7, 1, 1, 1, 1, 7, 7, 3, ...
## $ nBids <int> 20, 13, 16, 18, 20, 19, 13, 15, 29, 8, 15, 15, 13, ...
## $ cond <fct> new, used, new, new, new, new, used, new, used, use...
## $ startPr <dbl> 0.99, 0.99, 0.99, 0.99, 0.01, 0.99, 0.01, 1.00, 0.9...
## $ shipPr <dbl> 4.00, 3.99, 3.50, 0.00, 0.00, 4.00, 0.00, 2.99, 4.0...
## $ totalPr <dbl> 51.55, 37.04, 45.50, 44.00, 71.00, 45.00, 37.02, 53...
## $ shipSp <fct> standard, firstClass, firstClass, standard, media, ...
## $ sellerRate <int> 1580, 365, 998, 7, 820, 270144, 7284, 4858, 27, 201...
## $ stockPhoto <fct> yes, yes, no, yes, yes, yes, yes, yes, yes, no, yes...
## $ wheels <int> 1, 1, 1, 1, 2, 0, 0, 2, 1, 1, 2, 2, 2, 2, 1, 0, 1, ...
## $ title <fct> ~~ Wii MARIO KART & WHEEL ~ NINTENDO Wii ~ BRAN...
# fit parallel slopes: wheels is numeric, cond is categorical; wrapping the
# assignment in parentheses also prints the fitted model
(mod <- lm(totalPr ~ wheels + cond, data=marioKart))
##
## Call:
## lm(formula = totalPr ~ wheels + cond, data = marioKart)
##
## Coefficients:
## (Intercept) wheels condused
## 37.6673 10.2161 0.8457
# The parallel slopes model mod relating total price to the number of wheels and condition is already in your workspace.
# Augment the model: adds .fitted, .resid, etc. for each observation
augmented_mod <- broom::augment(mod)
glimpse(augmented_mod)
## Observations: 143
## Variables: 10
## $ totalPr <dbl> 51.55, 37.04, 45.50, 44.00, 71.00, 45.00, 37.02, 53...
## $ wheels <int> 1, 1, 1, 1, 2, 0, 0, 2, 1, 1, 2, 2, 2, 2, 1, 0, 1, ...
## $ cond <fct> new, used, new, new, new, new, used, new, used, use...
## $ .fitted <dbl> 47.88342, 48.72916, 47.88342, 47.88342, 58.09954, 3...
## $ .se.fit <dbl> 3.532896, 2.696310, 3.532896, 3.532896, 3.375000, 5...
## $ .resid <dbl> 3.666579, -11.689162, -2.383421, -3.883421, 12.9004...
## $ .hat <dbl> 0.02093127, 0.01219196, 0.02093127, 0.02093127, 0.0...
## $ .sigma <dbl> 24.504954, 24.486658, 24.506118, 24.504709, 24.4820...
## $ .cooksd <dbl> 1.640983e-04, 9.543509e-04, 6.933998e-05, 1.840819e...
## $ .std.resid <dbl> 0.15174745, -0.48163062, -0.09864186, -0.16072186, ...
# Scatterplot of total price versus wheels, colored by condition
data_space <- ggplot(augmented_mod,
                     aes(x = wheels, y = totalPr, color = cond)) +
  geom_point()
# One geom_line() call draws both parallel fitted lines (color is inherited)
data_space + geom_line(aes(y = .fitted))
# The babies data set contains observations about the birthweight and other characteristics of children born in the San Francisco Bay area from 1960--1967
# We would like to build a model for birthweight as a function of the mother's age and whether this child was her first (parity == 0)
# birthweight=β0+β1⋅age+β2⋅parity+ϵ
# Load the babies data from openintro and inspect its structure
data(babies, package="openintro")
str(babies)
## 'data.frame': 1236 obs. of 8 variables:
## $ case : int 1 2 3 4 5 6 7 8 9 10 ...
## $ bwt : int 120 113 128 123 108 136 138 132 120 143 ...
## $ gestation: int 284 282 279 NA 282 286 244 245 289 299 ...
## $ parity : int 0 0 0 0 0 0 0 0 0 0 ...
## $ age : int 27 33 28 36 23 25 33 23 25 30 ...
## $ height : int 62 64 64 69 67 62 62 65 62 66 ...
## $ weight : int 100 135 115 190 125 93 178 140 125 136 ...
## $ smoke : int 0 0 1 0 1 0 0 0 0 1 ...
# build model: birthweight (bwt) on mother's age and parity (parallel slopes)
lm(bwt ~ age + parity, data=babies)
##
## Call:
## lm(formula = bwt ~ age + parity, data = babies)
##
## Coefficients:
## (Intercept) age parity
## 118.27782 0.06315 -1.65248
# build model: birthweight (bwt) on gestation length and smoking indicator
lm(bwt ~ gestation + smoke, data=babies)
##
## Call:
## lm(formula = bwt ~ gestation + smoke, data = babies)
##
## Coefficients:
## (Intercept) gestation smoke
## -0.9317 0.4429 -8.0883
Chapter 2 - Evaluating and Extending Parallel Slopes
Model fit, residuals, and prediction:
Understanding interaction - idea that the model might have both different slopes and different intercepts:
* The interaction term is the product of two (or more) variables - for example Y ~ X1 + X2 + X1:X2
* The R syntax colon (:) denotes X1:X2, the interaction between X1 and X2
Interaction terms can change the interpretation of the model, and also of the components of the model
* Including an interaction term in a model is easy—we just have to tell lm() that we want to include that new variable. An expression of the form
* lm(y ~ x + z + x:z, data = mydata)
* The use of the colon (:) here means that the interaction between x and z will be a third term in the model
* Interaction models are easy to visualize with ggplot2 because they have the same coefficients as if the models were fit independently to each group defined by the level of the categorical variable
* In this case, new and used MarioKarts each get their own regression line
* To see this, we can set an aesthetic (e.g. color) to the categorical variable, and then add a geom_smooth() layer to overlay the regression line for each color
Simpson’s Paradox:
Example code includes:
# Keep only auctions with total price <= $75 (drops outlying listings)
mario_kart <- marioKart %>% filter(totalPr <= 75)
# fit parallel slopes on the filtered data
(mod <- lm(totalPr ~ wheels + cond, data=mario_kart))
##
## Call:
## lm(formula = totalPr ~ wheels + cond, data = mario_kart)
##
## Coefficients:
## (Intercept) wheels condused
## 42.370 7.233 -5.585
# R^2 and adjusted R^2 are reported near the bottom of the summary() output
summary(mod)
##
## Call:
## lm(formula = totalPr ~ wheels + cond, data = mario_kart)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.0078 -3.0754 -0.8254 2.9822 14.1646
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 42.3698 1.0651 39.780 < 2e-16 ***
## wheels 7.2328 0.5419 13.347 < 2e-16 ***
## condused -5.5848 0.9245 -6.041 1.35e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.887 on 138 degrees of freedom
## Multiple R-squared: 0.7165, Adjusted R-squared: 0.7124
## F-statistic: 174.4 on 2 and 138 DF, p-value: < 2.2e-16
# add random noise (rnorm() is unseeded here, so the printed output below
# will not reproduce exactly on a re-run)
mario_kart_noisy <- mario_kart %>%
mutate(noise = rnorm(n=n()))
# compute new model that includes the pure-noise predictor
mod2 <- lm(totalPr ~ wheels + cond + noise, data=mario_kart_noisy)
# new R^2 and adjusted R^2: R^2 never decreases when a predictor is added;
# adjusted R^2 applies a penalty for the extra term
summary(mod2)
##
## Call:
## lm(formula = totalPr ~ wheels + cond + noise, data = mario_kart_noisy)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.9844 -2.9707 -0.9883 2.8480 14.2079
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 42.5665 1.0701 39.777 < 2e-16 ***
## wheels 7.2002 0.5404 13.324 < 2e-16 ***
## condused -5.7546 0.9288 -6.196 6.36e-09 ***
## noise -0.5537 0.3888 -1.424 0.157
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.869 on 137 degrees of freedom
## Multiple R-squared: 0.7207, Adjusted R-squared: 0.7145
## F-statistic: 117.8 on 3 and 137 DF, p-value: < 2.2e-16
# return a vector: predict() with no newdata gives fitted values for the
# training observations
predict(mod)
## 1 2 3 4 5 6 7 8
## 49.60260 44.01777 49.60260 49.60260 56.83544 42.36976 36.78493 56.83544
## 9 10 11 12 13 14 15 16
## 44.01777 44.01777 56.83544 56.83544 56.83544 56.83544 44.01777 36.78493
## 17 18 19 20 21 22 23 24
## 49.60260 49.60260 56.83544 36.78493 56.83544 56.83544 56.83544 44.01777
## 25 26 27 28 29 30 31 32
## 56.83544 36.78493 36.78493 36.78493 49.60260 36.78493 36.78493 44.01777
## 33 34 35 36 37 38 39 40
## 51.25061 44.01777 44.01777 36.78493 44.01777 56.83544 56.83544 49.60260
## 41 42 43 44 45 46 47 48
## 44.01777 51.25061 56.83544 56.83544 44.01777 56.83544 36.78493 36.78493
## 49 50 51 52 53 54 55 56
## 44.01777 56.83544 36.78493 44.01777 42.36976 36.78493 36.78493 44.01777
## 57 58 59 60 61 62 63 64
## 44.01777 36.78493 36.78493 56.83544 36.78493 56.83544 36.78493 51.25061
## 65 66 67 68 69 70 71 72
## 56.83544 44.01777 58.48345 51.25061 49.60260 44.01777 49.60260 56.83544
## 73 74 75 76 77 78 79 80
## 56.83544 51.25061 44.01777 36.78493 36.78493 36.78493 44.01777 56.83544
## 81 82 83 84 85 86 87 88
## 44.01777 65.71629 44.01777 56.83544 36.78493 49.60260 49.60260 36.78493
## 89 90 91 92 93 94 95 96
## 44.01777 36.78493 51.25061 44.01777 36.78493 51.25061 42.36976 56.83544
## 97 98 99 100 101 102 103 104
## 51.25061 44.01777 51.25061 56.83544 56.83544 56.83544 36.78493 49.60260
## 105 106 107 108 109 110 111 112
## 51.25061 44.01777 56.83544 49.60260 36.78493 44.01777 51.25061 56.83544
## 113 114 115 116 117 118 119 120
## 64.06828 44.01777 49.60260 44.01777 49.60260 51.25061 42.36976 44.01777
## 121 122 123 124 125 126 127 128
## 56.83544 44.01777 49.60260 44.01777 51.25061 56.83544 56.83544 49.60260
## 129 130 131 132 133 134 135 136
## 56.83544 36.78493 44.01777 44.01777 36.78493 56.83544 36.78493 44.01777
## 137 138 139 140 141
## 36.78493 51.25061 49.60260 36.78493 56.83544
# return a data frame: augment() binds the diagnostics (.fitted, .resid,
# .hat, ...) alongside the original columns
broom::augment(mod)
## totalPr wheels cond .fitted .se.fit .resid .hat
## 1 51.55 1 new 49.60260 0.7087865 1.94739955 0.02103158
## 2 37.04 1 used 44.01777 0.5465195 -6.97776738 0.01250410
## 3 45.50 1 new 49.60260 0.7087865 -4.10260045 0.02103158
## 4 44.00 1 new 49.60260 0.7087865 -5.60260045 0.02103158
## 5 71.00 2 new 56.83544 0.6764502 14.16455915 0.01915635
## 6 45.00 0 new 42.36976 1.0651119 2.63023994 0.04749321
## 7 37.02 0 used 36.78493 0.7065565 0.23507301 0.02089945
## 8 53.99 2 new 56.83544 0.6764502 -2.84544085 0.01915635
## 9 47.00 1 used 44.01777 0.5465195 2.98223262 0.01250410
## 10 50.00 1 used 44.01777 0.5465195 5.98223262 0.01250410
## 11 54.99 2 new 56.83544 0.6764502 -1.84544085 0.01915635
## 12 56.01 2 new 56.83544 0.6764502 -0.82544085 0.01915635
## 13 48.00 2 new 56.83544 0.6764502 -8.83544085 0.01915635
## 14 56.00 2 new 56.83544 0.6764502 -0.83544085 0.01915635
## 15 43.33 1 used 44.01777 0.5465195 -0.68776738 0.01250410
## 16 46.00 0 used 36.78493 0.7065565 9.21507301 0.02089945
## 17 46.71 1 new 49.60260 0.7087865 -2.89260045 0.02103158
## 18 46.00 1 new 49.60260 0.7087865 -3.60260045 0.02103158
## 19 55.99 2 new 56.83544 0.6764502 -0.84544085 0.01915635
## 20 31.00 0 used 36.78493 0.7065565 -5.78492699 0.02089945
## 21 53.98 2 new 56.83544 0.6764502 -2.85544085 0.01915635
## 22 64.95 2 new 56.83544 0.6764502 8.11455915 0.01915635
## 23 50.50 2 new 56.83544 0.6764502 -6.33544085 0.01915635
## 24 46.50 1 used 44.01777 0.5465195 2.48223262 0.01250410
## 25 55.00 2 new 56.83544 0.6764502 -1.83544085 0.01915635
## 26 34.50 0 used 36.78493 0.7065565 -2.28492699 0.02089945
## 27 36.00 0 used 36.78493 0.7065565 -0.78492699 0.02089945
## 28 40.00 0 used 36.78493 0.7065565 3.21507301 0.02089945
## 29 47.00 1 new 49.60260 0.7087865 -2.60260045 0.02103158
## 30 43.00 0 used 36.78493 0.7065565 6.21507301 0.02089945
## 31 31.00 0 used 36.78493 0.7065565 -5.78492699 0.02089945
## 32 41.99 1 used 44.01777 0.5465195 -2.02776738 0.01250410
## 33 49.49 2 used 51.25061 0.8279109 -1.76060777 0.02869514
## 34 41.00 1 used 44.01777 0.5465195 -3.01776738 0.01250410
## 35 44.78 1 used 44.01777 0.5465195 0.76223262 0.01250410
## 36 47.00 0 used 36.78493 0.7065565 10.21507301 0.02089945
## 37 44.00 1 used 44.01777 0.5465195 -0.01776738 0.01250410
## 38 63.99 2 new 56.83544 0.6764502 7.15455915 0.01915635
## 39 53.76 2 new 56.83544 0.6764502 -3.07544085 0.01915635
## 40 46.03 1 new 49.60260 0.7087865 -3.57260045 0.02103158
## 41 42.25 1 used 44.01777 0.5465195 -1.76776738 0.01250410
## 42 46.00 2 used 51.25061 0.8279109 -5.25060777 0.02869514
## 43 51.99 2 new 56.83544 0.6764502 -4.84544085 0.01915635
## 44 55.99 2 new 56.83544 0.6764502 -0.84544085 0.01915635
## 45 41.99 1 used 44.01777 0.5465195 -2.02776738 0.01250410
## 46 53.99 2 new 56.83544 0.6764502 -2.84544085 0.01915635
## 47 39.00 0 used 36.78493 0.7065565 2.21507301 0.02089945
## 48 38.06 0 used 36.78493 0.7065565 1.27507301 0.02089945
## 49 46.00 1 used 44.01777 0.5465195 1.98223262 0.01250410
## 50 59.88 2 new 56.83544 0.6764502 3.04455915 0.01915635
## 51 28.98 0 used 36.78493 0.7065565 -7.80492699 0.02089945
## 52 36.00 1 used 44.01777 0.5465195 -8.01776738 0.01250410
## 53 51.99 0 new 42.36976 1.0651119 9.62023994 0.04749321
## 54 43.95 0 used 36.78493 0.7065565 7.16507301 0.02089945
## 55 32.00 0 used 36.78493 0.7065565 -4.78492699 0.02089945
## 56 40.06 1 used 44.01777 0.5465195 -3.95776738 0.01250410
## 57 48.00 1 used 44.01777 0.5465195 3.98223262 0.01250410
## 58 36.00 0 used 36.78493 0.7065565 -0.78492699 0.02089945
## 59 31.00 0 used 36.78493 0.7065565 -5.78492699 0.02089945
## 60 53.99 2 new 56.83544 0.6764502 -2.84544085 0.01915635
## 61 30.00 0 used 36.78493 0.7065565 -6.78492699 0.02089945
## 62 58.00 2 new 56.83544 0.6764502 1.16455915 0.01915635
## 63 38.10 0 used 36.78493 0.7065565 1.31507301 0.02089945
## 64 61.76 2 used 51.25061 0.8279109 10.50939223 0.02869514
## 65 53.99 2 new 56.83544 0.6764502 -2.84544085 0.01915635
## 66 40.00 1 used 44.01777 0.5465195 -4.01776738 0.01250410
## 67 64.50 3 used 58.48345 1.2882085 6.01655183 0.06947257
## 68 49.01 2 used 51.25061 0.8279109 -2.24060777 0.02869514
## 69 47.00 1 new 49.60260 0.7087865 -2.60260045 0.02103158
## 70 40.10 1 used 44.01777 0.5465195 -3.91776738 0.01250410
## 71 41.50 1 new 49.60260 0.7087865 -8.10260045 0.02103158
## 72 56.00 2 new 56.83544 0.6764502 -0.83544085 0.01915635
## 73 64.95 2 new 56.83544 0.6764502 8.11455915 0.01915635
## 74 49.00 2 used 51.25061 0.8279109 -2.25060777 0.02869514
## 75 48.00 1 used 44.01777 0.5465195 3.98223262 0.01250410
## 76 38.00 0 used 36.78493 0.7065565 1.21507301 0.02089945
## 77 45.00 0 used 36.78493 0.7065565 8.21507301 0.02089945
## 78 41.95 0 used 36.78493 0.7065565 5.16507301 0.02089945
## 79 43.36 1 used 44.01777 0.5465195 -0.65776738 0.01250410
## 80 54.99 2 new 56.83544 0.6764502 -1.84544085 0.01915635
## 81 45.21 1 used 44.01777 0.5465195 1.19223262 0.01250410
## 82 65.02 4 used 65.71629 1.7946635 -0.69628856 0.13483640
## 83 45.75 1 used 44.01777 0.5465195 1.73223262 0.01250410
## 84 64.00 2 new 56.83544 0.6764502 7.16455915 0.01915635
## 85 36.00 0 used 36.78493 0.7065565 -0.78492699 0.02089945
## 86 54.70 1 new 49.60260 0.7087865 5.09739955 0.02103158
## 87 49.91 1 new 49.60260 0.7087865 0.30739955 0.02103158
## 88 47.00 0 used 36.78493 0.7065565 10.21507301 0.02089945
## 89 43.00 1 used 44.01777 0.5465195 -1.01776738 0.01250410
## 90 35.99 0 used 36.78493 0.7065565 -0.79492699 0.02089945
## 91 54.49 2 used 51.25061 0.8279109 3.23939223 0.02869514
## 92 46.00 1 used 44.01777 0.5465195 1.98223262 0.01250410
## 93 31.06 0 used 36.78493 0.7065565 -5.72492699 0.02089945
## 94 55.60 2 used 51.25061 0.8279109 4.34939223 0.02869514
## 95 40.10 0 new 42.36976 1.0651119 -2.26976006 0.04749321
## 96 52.59 2 new 56.83544 0.6764502 -4.24544085 0.01915635
## 97 44.00 2 used 51.25061 0.8279109 -7.25060777 0.02869514
## 98 38.26 1 used 44.01777 0.5465195 -5.75776738 0.01250410
## 99 51.00 2 used 51.25061 0.8279109 -0.25060777 0.02869514
## 100 48.99 2 new 56.83544 0.6764502 -7.84544085 0.01915635
## 101 66.44 2 new 56.83544 0.6764502 9.60455915 0.01915635
## 102 63.50 2 new 56.83544 0.6764502 6.66455915 0.01915635
## 103 42.00 0 used 36.78493 0.7065565 5.21507301 0.02089945
## 104 47.00 1 new 49.60260 0.7087865 -2.60260045 0.02103158
## 105 55.00 2 used 51.25061 0.8279109 3.74939223 0.02869514
## 106 33.01 1 used 44.01777 0.5465195 -11.00776738 0.01250410
## 107 53.76 2 new 56.83544 0.6764502 -3.07544085 0.01915635
## 108 46.00 1 new 49.60260 0.7087865 -3.60260045 0.02103158
## 109 43.00 0 used 36.78493 0.7065565 6.21507301 0.02089945
## 110 42.55 1 used 44.01777 0.5465195 -1.46776738 0.01250410
## 111 52.50 2 used 51.25061 0.8279109 1.24939223 0.02869514
## 112 57.50 2 new 56.83544 0.6764502 0.66455915 0.01915635
## 113 75.00 3 new 64.06828 1.0000415 10.93171876 0.04186751
## 114 48.92 1 used 44.01777 0.5465195 4.90223262 0.01250410
## 115 45.99 1 new 49.60260 0.7087865 -3.61260045 0.02103158
## 116 40.05 1 used 44.01777 0.5465195 -3.96776738 0.01250410
## 117 45.00 1 new 49.60260 0.7087865 -4.60260045 0.02103158
## 118 50.00 2 used 51.25061 0.8279109 -1.25060777 0.02869514
## 119 49.75 0 new 42.36976 1.0651119 7.38023994 0.04749321
## 120 47.00 1 used 44.01777 0.5465195 2.98223262 0.01250410
## 121 56.00 2 new 56.83544 0.6764502 -0.83544085 0.01915635
## 122 41.00 1 used 44.01777 0.5465195 -3.01776738 0.01250410
## 123 46.00 1 new 49.60260 0.7087865 -3.60260045 0.02103158
## 124 34.99 1 used 44.01777 0.5465195 -9.02776738 0.01250410
## 125 49.00 2 used 51.25061 0.8279109 -2.25060777 0.02869514
## 126 61.00 2 new 56.83544 0.6764502 4.16455915 0.01915635
## 127 62.89 2 new 56.83544 0.6764502 6.05455915 0.01915635
## 128 46.00 1 new 49.60260 0.7087865 -3.60260045 0.02103158
## 129 64.95 2 new 56.83544 0.6764502 8.11455915 0.01915635
## 130 36.99 0 used 36.78493 0.7065565 0.20507301 0.02089945
## 131 44.00 1 used 44.01777 0.5465195 -0.01776738 0.01250410
## 132 41.35 1 used 44.01777 0.5465195 -2.66776738 0.01250410
## 133 37.00 0 used 36.78493 0.7065565 0.21507301 0.02089945
## 134 58.98 2 new 56.83544 0.6764502 2.14455915 0.01915635
## 135 39.00 0 used 36.78493 0.7065565 2.21507301 0.02089945
## 136 40.70 1 used 44.01777 0.5465195 -3.31776738 0.01250410
## 137 39.51 0 used 36.78493 0.7065565 2.72507301 0.02089945
## 138 52.00 2 used 51.25061 0.8279109 0.74939223 0.02869514
## 139 47.70 1 new 49.60260 0.7087865 -1.90260045 0.02103158
## 140 38.76 0 used 36.78493 0.7065565 1.97507301 0.02089945
## 141 54.51 2 new 56.83544 0.6764502 -2.32544085 0.01915635
## .sigma .cooksd .std.resid
## 1 4.902339 1.161354e-03 0.402708933
## 2 4.868399 8.712334e-03 -1.436710863
## 3 4.892414 5.154337e-03 -0.848389768
## 4 4.881308 9.612441e-03 -1.158579529
## 5 4.750591 5.574926e-02 2.926332759
## 6 4.899816 5.053659e-03 0.551419180
## 7 4.905181 1.681147e-05 0.048608215
## 8 4.899077 2.249739e-03 -0.587854989
## 9 4.898517 1.591419e-03 0.614036807
## 10 4.878184 6.403658e-03 1.231731888
## 11 4.902639 9.463096e-04 -0.381259589
## 12 4.904706 1.893237e-04 -0.170532281
## 13 4.845644 2.169149e-02 -1.825361432
## 14 4.904693 1.939387e-04 -0.172598235
## 15 4.904866 8.464177e-05 -0.141610176
## 16 4.840263 2.583436e-02 1.905485609
## 17 4.898859 2.562308e-03 -0.598170028
## 18 4.895349 3.974537e-03 -0.744993181
## 19 4.904680 1.986092e-04 -0.174664189
## 20 4.879726 1.018113e-02 -1.196202689
## 21 4.899034 2.265580e-03 -0.589920943
## 22 4.855017 1.829628e-02 1.676430592
## 23 4.874681 1.115287e-02 -1.308872933
## 24 4.900578 1.102520e-03 0.511087627
## 25 4.902666 9.360817e-04 -0.379193635
## 26 4.901254 1.588345e-03 -0.472475419
## 27 4.904754 1.874384e-04 -0.162306590
## 28 4.897361 3.144719e-03 0.664810290
## 29 4.900072 2.074290e-03 -0.538200007
## 30 4.875781 1.175148e-02 1.285147949
## 31 4.879726 1.018113e-02 -1.196202689
## 32 4.902124 7.357628e-04 -0.417513979
## 33 4.902848 1.315656e-03 -0.365515142
## 34 4.898356 1.629570e-03 -0.621353356
## 35 4.904785 1.039625e-04 0.156942447
## 36 4.825276 3.174557e-02 2.112264829
## 37 4.905222 5.648698e-08 -0.003658274
## 38 4.866239 1.422325e-02 1.478099008
## 39 4.898043 2.628135e-03 -0.635371930
## 40 4.895513 3.908619e-03 -0.738789386
## 41 4.902867 5.591802e-04 -0.363980405
## 42 4.884059 1.170136e-02 -1.090064849
## 43 4.887380 6.523784e-03 -1.001045788
## 44 4.904680 1.986092e-04 -0.174664189
## 45 4.902124 7.357628e-04 -0.417513979
## 46 4.899077 2.249739e-03 -0.587854989
## 47 4.901493 1.492713e-03 0.458031070
## 48 4.903987 4.946184e-04 0.263658603
## 49 4.902261 7.030898e-04 0.408138446
## 50 4.898186 2.575620e-03 0.628991915
## 51 4.858711 1.853266e-02 -1.613896713
## 52 4.856546 1.150293e-02 -1.650845158
## 53 4.832389 6.760627e-02 2.016844445
## 54 4.866054 1.561857e-02 1.481588208
## 55 4.887793 6.965475e-03 -0.989423469
## 56 4.893406 2.802864e-03 -0.814897814
## 57 4.893260 2.837624e-03 0.819935167
## 58 4.904754 1.874384e-04 -0.162306590
## 59 4.879726 1.018113e-02 -1.196202689
## 60 4.899077 2.249739e-03 -0.587854989
## 61 4.870114 1.400524e-02 -1.402981909
## 62 4.904194 3.768392e-04 0.240592564
## 63 4.903908 5.261382e-04 0.271929772
## 64 4.819876 4.687834e-02 2.181827236
## 65 4.899077 2.249739e-03 -0.587854989
## 66 4.893045 2.888492e-03 -0.827251716
## 67 4.876193 4.052940e-02 1.276155547
## 68 4.901375 2.130829e-03 -0.465166678
## 69 4.900072 2.074290e-03 -0.538200007
## 70 4.893644 2.746495e-03 -0.806661880
## 71 4.855070 2.010496e-02 -1.675562463
## 72 4.904693 1.939387e-04 -0.172598235
## 73 4.855017 1.829628e-02 1.676430592
## 74 4.901341 2.149892e-03 -0.467242751
## 75 4.893260 2.837624e-03 0.819935167
## 76 4.904101 4.491639e-04 0.251251850
## 77 4.853667 2.053161e-02 1.698706389
## 78 4.884908 8.116206e-03 1.068029769
## 79 4.904897 7.741876e-05 -0.135433225
## 80 4.902639 9.463096e-04 -0.381259589
## 81 4.904152 2.543452e-04 0.245478742
## 82 4.904806 1.218734e-03 -0.153165404
## 83 4.902961 5.369254e-04 0.356663856
## 84 4.866129 1.426303e-02 1.480164962
## 85 4.904754 1.874384e-04 -0.162306590
## 86 4.885435 7.957044e-03 1.054107431
## 87 4.905151 2.893749e-05 0.063568128
## 88 4.825276 3.174557e-02 2.112264829
## 89 4.904442 1.853526e-04 -0.209556635
## 90 4.904742 1.922448e-04 -0.164374382
## 91 4.897178 4.453937e-03 0.672521687
## 92 4.902261 7.030898e-04 0.408138446
## 93 4.880253 9.971030e-03 -1.183795936
## 94 4.890710 8.029235e-03 0.902965863
## 95 4.901197 3.763354e-03 -0.475846028
## 96 4.891531 5.008164e-03 -0.877088548
## 97 4.864786 2.231340e-02 -1.505279580
## 98 4.880180 5.932118e-03 -1.185514863
## 99 4.905174 2.665668e-05 -0.052028020
## 100 4.858308 1.710282e-02 -1.620831987
## 101 4.834741 2.563232e-02 1.984257737
## 102 4.871414 1.234172e-02 1.376867262
## 103 4.884512 8.274103e-03 1.078368730
## 104 4.900072 2.074290e-03 -0.538200007
## 105 4.894442 5.966763e-03 0.778401443
## 106 4.813060 2.168204e-02 -2.266481255
## 107 4.898043 2.628135e-03 -0.635371930
## 108 4.895349 3.974537e-03 -0.744993181
## 109 4.875781 1.175148e-02 1.285147949
## 110 4.903599 3.854926e-04 -0.302210897
## 111 4.904027 6.625438e-04 0.259383029
## 112 4.904888 1.227157e-04 0.137294864
## 113 4.811529 7.605411e-02 2.285052621
## 114 4.887082 4.300207e-03 1.009361659
## 115 4.895294 3.996633e-03 -0.747061113
## 116 4.893346 2.817046e-03 -0.816956798
## 117 4.889096 6.487255e-03 -0.951786355
## 118 4.904024 6.638336e-04 -0.259635386
## 119 4.862490 3.978837e-02 1.547237493
## 120 4.898517 1.591419e-03 0.614036807
## 121 4.904693 1.939387e-04 -0.172598235
## 122 4.898356 1.629570e-03 -0.621353356
## 123 4.895349 3.974537e-03 -0.744993181
## 124 4.843427 1.458352e-02 -1.858802502
## 125 4.901341 2.149892e-03 -0.467242751
## 126 4.892049 4.819157e-03 0.860378763
## 127 4.877336 1.018587e-02 1.250844068
## 128 4.895349 3.974537e-03 -0.744993181
## 129 4.855017 1.829628e-02 1.676430592
## 130 4.905191 1.279432e-05 0.042404838
## 131 4.905222 5.648698e-08 -0.003658274
## 132 4.899857 1.273496e-03 -0.549288929
## 133 4.905187 1.407252e-05 0.044472630
## 134 4.901733 1.277936e-03 0.443056056
## 135 4.901493 1.492713e-03 0.458031070
## 136 4.896922 1.969670e-03 -0.683122864
## 137 4.899576 2.259209e-03 0.563488472
## 138 4.904792 2.383611e-04 0.155579346
## 139 4.902471 1.108535e-03 -0.393444786
## 140 4.902257 1.186770e-03 0.408404057
## 141 4.901119 1.502601e-03 -0.480425381
# include interaction
lm(totalPr ~ cond + duration + cond:duration, data=mario_kart)
##
## Call:
## lm(formula = totalPr ~ cond + duration + cond:duration, data = mario_kart)
##
## Coefficients:
## (Intercept) condused duration
## 58.268 -17.122 -1.966
## condused:duration
## 2.325
# interaction plot
ggplot(mario_kart, aes(x=duration, y=totalPr, color=cond)) +
geom_point() +
geom_smooth(method="lm", se=FALSE)
# Base scatterplot of total price vs. auction duration with a single
# least-squares fit line. se is documented as logical in ggplot2, so use
# FALSE explicitly rather than relying on 0 being coerced to FALSE.
slr <- ggplot(mario_kart, aes(y = totalPr, x = duration)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE)
# model with one slope
lm(totalPr ~ duration, data=mario_kart)
##
## Call:
## lm(formula = totalPr ~ duration, data = mario_kart)
##
## Coefficients:
## (Intercept) duration
## 52.374 -1.317
# plot with two slopes
slr + aes(color=cond)
Chapter 3 - Multiple Regression
Adding a numerical explanatory variable - regressions with 2+ numerical variables:
Conditional interpretation of coefficients:
Adding a third (categorical) variable:
Higher dimensions:
Example code includes:
# Fit the model using duration and startPr
(mod <- lm(totalPr ~ duration + startPr, data=mario_kart))
##
## Call:
## lm(formula = totalPr ~ duration + startPr, data = mario_kart)
##
## Coefficients:
## (Intercept) duration startPr
## 51.030 -1.508 0.233
# One method for visualizing a multiple linear regression model is to create a heatmap of the fitted values in the plane defined by the two explanatory variables
# This heatmap will illustrate how the model output changes over different combinations of the explanatory variables
# This is a multistep process
# First, create a grid of the possible pairs of values of the explanatory variables. The grid should be over the actual range of the data present in each variable. We've done this for you and stored the result as a data frame called grid
# Use augment() with the newdata argument to find the y-hat corresponding to the values in grid
# Add these to the data_space plot by using the fill aesthetic and geom_tile()
# add predictions to grid
grid <- expand.grid(duration=1:10, startPr=seq(0.01, 69.95, by=0.01))
price_hats <- broom::augment(mod, newdata=grid)
# tile the plane
data_space <- mario_kart %>% filter(totalPr <= 75) %>%
ggplot(aes(x=duration, y=startPr)) +
geom_point(aes(col=totalPr))
data_space +
geom_tile(data = price_hats, aes(fill=.fitted), alpha=0.5)
# An alternative way to visualize a multiple regression model with two numeric explanatory variables is as a plane in three dimensions. This is possible in R using the plotly package
# We have created three objects that you will need
# x: a vector of unique values of duration
# y: a vector of unique values of startPr
# plane: a matrix of the fitted values across all combinations of x and y
# draw the 3D scatterplot
p <- plotly::plot_ly(data = mario_kart, z = ~totalPr, x = ~duration, y = ~startPr, opacity = 0.6) %>%
plotly::add_markers()
# draw the plane
x <- c(1, 1.13, 1.261, 1.391, 1.522, 1.652, 1.783, 1.913, 2.043, 2.174, 2.304, 2.435, 2.565, 2.696, 2.826, 2.957, 3.087, 3.217, 3.348, 3.478, 3.609, 3.739, 3.87, 4, 4.13, 4.261, 4.391, 4.522, 4.652, 4.783, 4.913, 5.043, 5.174, 5.304, 5.435, 5.565, 5.696, 5.826, 5.957, 6.087, 6.217, 6.348, 6.478, 6.609, 6.739, 6.87, 7, 7.13, 7.261, 7.391, 7.522, 7.652, 7.783, 7.913, 8.043, 8.174, 8.304, 8.435, 8.565, 8.696, 8.826, 8.957, 9.087, 9.217, 9.348, 9.478, 9.609, 9.739, 9.87, 10)
y <- c(0.01, 1.024, 2.037, 3.051, 4.064, 5.078, 6.092, 7.105, 8.119, 9.133, 10.146, 11.16, 12.173, 13.187, 14.201, 15.214, 16.228, 17.242, 18.255, 19.269, 20.282, 21.296, 22.31, 23.323, 24.337, 25.351, 26.364, 27.378, 28.391, 29.405, 30.419, 31.432, 32.446, 33.46, 34.473, 35.487, 36.5, 37.514, 38.528, 39.541, 40.555, 41.569, 42.582, 43.596, 44.609, 45.623, 46.637, 47.65, 48.664, 49.678, 50.691, 51.705, 52.718, 53.732, 54.746, 55.759, 56.773, 57.787, 58.8, 59.814, 60.827, 61.841, 62.855, 63.868, 64.882, 65.896, 66.909, 67.923, 68.936, 69.95)
grid <- expand.grid(duration=x, startPr=y)
predPr <- broom::augment(mod, newdata=grid)
plane <- matrix(data=predPr$.fitted, nrow=70, ncol=70, byrow=FALSE)
p <- p %>%
plotly::add_surface(x = ~x, y = ~y, z = ~plane, showscale = FALSE)
# Commented due to inability to use in html
# p
# draw the 3D scatterplot
# p <- plotly::plot_ly(data = mario_kart, z = ~totalPr, x = ~duration, y = ~startPr, opacity = 0.6) %>%
# plotly::add_markers(color = ~cond)
# draw two planes
# p %>%
# add_surface(x = ~x, y = ~y, z = ~plane0, showscale = FALSE) %>%
# add_surface(x = ~x, y = ~y, z = ~plane1, showscale = FALSE)
Chapter 4 - Logistic Regression
What is logistic regression?
Visualizing logistic regression:
Three scales approach to visualization:
Using a logistic model - objective is to gain better understanding into the underlying process:
Example code includes:
# To see this in action, we'll fit a linear regression model to data about 55 students who applied to medical school
# We want to understand how their undergraduate GPA relates to the probability they will be accepted by a particular school (Acceptance)
# The medical school acceptance data is loaded in your workspace as MedGPA
# scatterplot with jitter
# tmpSAT <- readr::read_csv("./RInputFiles/SAT.csv")
tmpAccept <- c(0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0)
tmpGPA <- c(3.62, 3.84, 3.23, 3.69, 3.38, 3.72, 3.89, 3.34, 3.71, 3.89, 3.97, 3.49, 3.77, 3.61, 3.3, 3.54, 3.65, 3.54, 3.25, 3.89, 3.71, 3.77, 3.91, 3.88, 3.68, 3.56, 3.44, 3.58, 3.4, 3.82, 3.62, 3.09, 3.89, 3.7, 3.24, 3.86, 3.54, 3.4, 3.87, 3.14, 3.37, 3.38, 3.62, 3.94, 3.37, 3.36, 3.97, 3.04, 3.29, 3.67, 2.72, 3.56, 3.48, 2.8, 3.44)
MedGPA <- data.frame(Acceptance=tmpAccept, GPA=tmpGPA)
str(MedGPA)
## 'data.frame': 55 obs. of 2 variables:
## $ Acceptance: num 0 1 1 1 1 1 1 0 1 1 ...
## $ GPA : num 3.62 3.84 3.23 3.69 3.38 3.72 3.89 3.34 3.71 3.89 ...
data_space <- ggplot(MedGPA, aes(x=GPA, y=Acceptance)) +
geom_jitter(width = 0, height = 0.05, alpha = 0.5)
# linear regression line
data_space +
geom_smooth(method="lm", se=FALSE)
# filter
MedGPA_middle <- MedGPA %>%
filter(GPA >= 3.375, GPA <= 3.77)
# scatterplot with jitter
data_space <- ggplot(MedGPA_middle, aes(x=GPA, y=Acceptance)) +
geom_jitter(width = 0, height = 0.05, alpha = 0.5)
# linear regression line
data_space +
geom_smooth(method="lm", se=FALSE)
# fit model
(mod <- glm(Acceptance ~ GPA, data = MedGPA, family = binomial))
##
## Call: glm(formula = Acceptance ~ GPA, family = binomial, data = MedGPA)
##
## Coefficients:
## (Intercept) GPA
## -19.207 5.454
##
## Degrees of Freedom: 54 Total (i.e. Null); 53 Residual
## Null Deviance: 75.79
## Residual Deviance: 56.84 AIC: 60.84
# scatterplot with jitter (vertical jitter only, so GPA values stay exact)
data_space <- ggplot(MedGPA, aes(x=GPA, y=Acceptance)) +
geom_jitter(width=0, height=0.05, alpha = .5)
# add logistic curve; method.args is forwarded to glm(), selecting the
# binomial family. A plain named list is equivalent to (and clearer than)
# the structure(..., .Names=...) deparse artifact it replaces.
data_space +
geom_smooth(method="glm", se=FALSE, method.args=list(family = "binomial"))
# We have created a data.frame called MedGPA_binned that aggregates the original data into separate bins for each 0.25 of GPA. It also contains the fitted values from the logistic regression model
MedGPA$bin <- round(MedGPA$GPA*4, 0) / 4
str(MedGPA)
## 'data.frame': 55 obs. of 3 variables:
## $ Acceptance: num 0 1 1 1 1 1 1 0 1 1 ...
## $ GPA : num 3.62 3.84 3.23 3.69 3.38 3.72 3.89 3.34 3.71 3.89 ...
## $ bin : num 3.5 3.75 3.25 3.75 3.5 3.75 4 3.25 3.75 4 ...
MedGPA_binned <- MedGPA %>%
group_by(bin) %>%
summarize(mean_GPA=mean(GPA), acceptance_rate=mean(Acceptance), ct=n())
MedGPA_binned
## # A tibble: 6 x 4
## bin mean_GPA acceptance_rate ct
## <dbl> <dbl> <dbl> <int>
## 1 2.75 2.76 0 2
## 2 3.00 3.06 0 2
## 3 3.25 3.29 0.200 10
## 4 3.50 3.51 0.556 18
## 5 3.75 3.75 0.643 14
## 6 4.00 3.91 1.00 9
# binned points and line
data_space <- ggplot(MedGPA_binned, aes(x=mean_GPA, y=acceptance_rate)) +
geom_point() +
geom_line()
# augmented model
MedGPA_plus <- broom::augment(mod, type.predict="response")
# logistic model on probability scale
data_space +
geom_line(data = MedGPA_plus, aes(x=GPA, y=.fitted), color = "red")
# The MedGPA_binned data frame contains the data for each GPA bin, while the MedGPA_plus data frame records the original observations after being augment()-ed by mod
# compute odds for bins
MedGPA_binned <- MedGPA_binned %>%
mutate(odds = acceptance_rate / (1 - acceptance_rate))
# plot binned odds
data_space <- ggplot(MedGPA_binned, aes(x=mean_GPA, y=odds)) +
geom_point() +
geom_line()
# compute odds for observations
MedGPA_plus <- MedGPA_plus %>%
mutate(odds_hat = .fitted / (1 - .fitted))
# logistic model on odds scale
data_space +
geom_line(data=MedGPA_plus, aes(x=GPA, y=odds_hat), color = "red")
# compute log odds for bins
MedGPA_binned <- MedGPA_binned %>%
mutate(log_odds = log(acceptance_rate / (1 - acceptance_rate)))
# plot binned log odds
data_space <- ggplot(MedGPA_binned, aes(x=mean_GPA, y=log_odds)) +
geom_point() +
geom_line()
# compute log odds for observations
MedGPA_plus <- MedGPA_plus %>%
mutate(log_odds_hat = log(.fitted / (1 - .fitted)))
# logistic model on log odds scale
data_space +
geom_line(data=MedGPA_plus, aes(x=GPA, y=log_odds_hat), color = "red")
# create new data frame
new_data <- data.frame(GPA = 3.51)
# make predictions
broom::augment(mod, newdata=new_data, type.predict="response")
## GPA .fitted .se.fit
## 1 3.51 0.4844099 0.08343193
# data frame with binary predictions
tidy_mod <- broom::augment(mod, type.predict="response") %>%
mutate(Acceptance_hat = round(.fitted))
# confusion matrix
tidy_mod %>%
select(Acceptance, Acceptance_hat) %>%
table()
## Acceptance_hat
## Acceptance 0 1
## 0 16 9
## 1 6 24
Chapter 5 - Case Study: Italian Restaurants in NYC
Italian restaurants in NYC - factors that contribute to the price of a meal:
Incorporating another variable:
Higher dimensions - adding the décor dimension to the existing Zagat analysis:
Wrap up:
Example code includes:
nyc <- readr::read_csv("./RInputFiles/nyc.csv")
## Parsed with column specification:
## cols(
## Case = col_integer(),
## Restaurant = col_character(),
## Price = col_integer(),
## Food = col_integer(),
## Decor = col_integer(),
## Service = col_integer(),
## East = col_integer()
## )
str(nyc, give.attr=FALSE)
## Classes 'tbl_df', 'tbl' and 'data.frame': 168 obs. of 7 variables:
## $ Case : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Restaurant: chr "Daniella Ristorante" "Tello's Ristorante" "Biricchino" "Bottino" ...
## $ Price : int 43 32 34 41 54 52 34 34 39 44 ...
## $ Food : int 22 20 21 20 24 22 22 20 22 21 ...
## $ Decor : int 18 19 13 20 19 22 16 18 19 17 ...
## $ Service : int 20 19 18 17 21 21 21 21 22 19 ...
## $ East : int 0 0 0 0 0 0 0 1 1 1 ...
# Price by Food plot
ggplot(nyc, aes(x=Food, y=Price)) +
geom_point()
# Price by Food model
lm(Price ~ Food, data=nyc)
##
## Call:
## lm(formula = Price ~ Food, data = nyc)
##
## Coefficients:
## (Intercept) Food
## -17.832 2.939
# fit model
lm(Price ~ Food + Service, data=nyc)
##
## Call:
## lm(formula = Price ~ Food + Service, data = nyc)
##
## Coefficients:
## (Intercept) Food Service
## -21.159 1.495 1.704
# draw 3D scatterplot
# p <- plot_ly(data = nyc, z = ~Price, x = ~Food, y = ~Service, opacity = 0.6) %>%
# add_markers()
# draw a plane
# p %>%
# add_surface(x = ~x, y = ~y, z = ~plane, showscale = FALSE)
# Price by Food and Service and East
lm(Price ~ Food + Service + East, data=nyc)
##
## Call:
## lm(formula = Price ~ Food + Service + East, data = nyc)
##
## Coefficients:
## (Intercept) Food Service East
## -20.8155 1.4863 1.6647 0.9649
# draw 3D scatterplot
# p <- plot_ly(data = nyc, z = ~Price, x = ~Food, y = ~Service, opacity = 0.6) %>%
# add_markers(color = ~factor(East))
# draw two planes
# p %>%
# add_surface(x = ~x, y = ~y, z = ~plane0, showscale = FALSE) %>%
# add_surface(x = ~x, y = ~y, z = ~plane1, showscale = FALSE)
Chapter 1 - Exploring and Visualizing Time Series in R
Introduction and overview:
Trends, seasonality, and cyclicity:
White noise - simply a time series of independently and identically distributed (iid) data:
Example code includes:
library(forecast)
# Read the data from Excel into R
mydata <- readxl::read_excel("./RInputFiles/exercise1.xlsx")
# Look at the first few lines of mydata
head(mydata)
## # A tibble: 6 x 4
## X__1 Sales AdBudget GDP
## <chr> <dbl> <dbl> <dbl>
## 1 Mar-81 1020 659 252
## 2 Jun-81 889 589 291
## 3 Sep-81 795 512 291
## 4 Dec-81 1004 614 292
## 5 Mar-82 1058 647 279
## 6 Jun-82 944 602 254
# Create a ts object called myts
myts <- ts(mydata[, -1], start = c(1981, 1), frequency = 4)
# Plot the data with facetting
autoplot(myts, facets = TRUE)
# Plot the data without facetting
autoplot(myts, facets = FALSE)
# Plot the three series
data(gold, package="forecast")
data(woolyrnq, package="forecast")
data(gas, package="forecast")
str(gold)
## Time-Series [1:1108] from 1 to 1108: 306 300 303 297 304 ...
str(woolyrnq)
## Time-Series [1:119] from 1965 to 1994: 6172 6709 6633 6660 6786 ...
str(gas)
## Time-Series [1:476] from 1956 to 1996: 1709 1646 1794 1878 2173 ...
autoplot(gold)
autoplot(woolyrnq)
autoplot(gas)
# Find the outlier in the gold series
goldoutlier <- which.max(gold)
# Look at the seasonal frequencies of the three series
frequency(gold)
## [1] 1
frequency(woolyrnq)
## [1] 4
frequency(gas)
## [1] 12
# In this exercise, you will load the fpp2 package and use two of its datasets
# a10 contains monthly sales volumes for anti-diabetic drugs in Australia. In the plots, can you see which month has the highest sales volume each year? What is unusual about the results in March and April 2008?
# ausbeer which contains quarterly beer production for Australia. What is happening to the beer production in Quarter 4?
# Load the fpp2 package
# library(fpp2)
data(a10, package="fpp2")
data(ausbeer, package="fpp2")
str(a10)
## Time-Series [1:204] from 1992 to 2008: 3.53 3.18 3.25 3.61 3.57 ...
str(ausbeer)
## Time-Series [1:218] from 1956 to 2010: 284 213 227 308 262 228 236 320 272 233 ...
# Create plots of the a10 data
autoplot(a10)
forecast::ggseasonplot(a10)
# Produce a polar coordinate season plot for the a10 data
forecast::ggseasonplot(a10, polar = TRUE)
# Restrict the ausbeer data to start in 1992
beer <- window(ausbeer, start=1992)
# Make plots of the beer data
autoplot(beer)
forecast::ggsubseriesplot(beer)
# In this exercise, you will work with the pre-loaded oil data (available in the package fpp2), which contains the annual oil production in Saudi Arabia from 1965-2013 (measured in millions of tons).
# Create an autoplot of the oil data
data(oil, package="fpp2")
str(oil)
## Time-Series [1:49] from 1965 to 2013: 111 131 141 154 163 ...
autoplot(oil)
# Create a lag plot of the oil data
forecast::gglagplot(oil)
# Create an ACF plot of the oil data
library(forecast)
ggAcf(oil)
# You will investigate this phenomenon by plotting the annual sunspot series (which follows the solar cycle of approximately 10-11 years) in sunspot.year
# and the daily traffic to the Hyndsight blog (which follows a 7-day weekly pattern) in hyndsight. Both objects have been loaded into your workspace.
# Plot the annual sunspot numbers
data(sunspot.year)
str(sunspot.year)
## Time-Series [1:289] from 1700 to 1988: 5 11 16 23 36 58 29 20 10 8 ...
autoplot(sunspot.year)
ggAcf(sunspot.year)
# Plot the traffic on the Hyndsight blog
data(hyndsight, package="fpp2")
str(hyndsight)
## Time-Series [1:365] from 1.43 to 53.4: 1157 1118 1310 874 890 1437 1263 1187 1506 1448 ...
autoplot(hyndsight)
ggAcf(hyndsight)
# You can test this hypothesis by looking at the goog series, which contains the closing stock price for Google over 1000 trading days ending on February 13, 2017. This data has been loaded into your workspace.
# Plot the original series
data(goog, package="fpp2")
str(goog)
## Time-Series [1:1000] from 1 to 1000: 393 393 397 398 400 ...
autoplot(goog)
# Plot the differenced series
autoplot(diff(goog))
# ACF of the differenced series
ggAcf(diff(goog))
# Ljung-Box test of the differenced series
Box.test(diff(goog), lag = 10, type = "Ljung")
##
## Box-Ljung test
##
## data: diff(goog)
## X-squared = 13.123, df = 10, p-value = 0.2169
Chapter 2 - Benchmark methods and forecast accuracy
Forecasts and potential futures:
Fitted values and residuals:
Training and test sets help to validate the forecasting methodology:
To subset observations from 101 to 500:
train <- subset(x, start = 101, end = 500, ...)
To subset the first 500 observations:
train <- subset(x, end = 500, ...)
Time series cross-validation attempts to solve some of the problems related to test-train:
Example code includes:
# Use naive() to forecast the goog series
fcgoog <- naive(goog, h=20)
# Plot and summarize the forecasts
autoplot(fcgoog)
summary(fcgoog)
##
## Forecast method: Naive method
##
## Model Information:
## Call: naive(y = goog, h = 20)
##
## Residual sd: 8.7285
##
## Error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.4212612 8.734286 5.829407 0.06253998 0.9741428 1
## ACF1
## Training set 0.03871446
##
## Forecasts:
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 1001 813.67 802.4765 824.8634 796.5511 830.7889
## 1002 813.67 797.8401 829.4999 789.4602 837.8797
## 1003 813.67 794.2824 833.0576 784.0192 843.3208
## 1004 813.67 791.2831 836.0569 779.4322 847.9078
## 1005 813.67 788.6407 838.6993 775.3910 851.9490
## 1006 813.67 786.2518 841.0882 771.7374 855.6025
## 1007 813.67 784.0549 843.2850 768.3777 858.9623
## 1008 813.67 782.0102 845.3298 765.2505 862.0895
## 1009 813.67 780.0897 847.2503 762.3133 865.0266
## 1010 813.67 778.2732 849.0667 759.5353 867.8047
## 1011 813.67 776.5456 850.7944 756.8931 870.4469
## 1012 813.67 774.8948 852.4452 754.3684 872.9715
## 1013 813.67 773.3115 854.0285 751.9470 875.3930
## 1014 813.67 771.7880 855.5520 749.6170 877.7230
## 1015 813.67 770.3180 857.0220 747.3688 879.9711
## 1016 813.67 768.8962 858.4437 745.1944 882.1455
## 1017 813.67 767.5183 859.8217 743.0870 884.2530
## 1018 813.67 766.1802 861.1597 741.0407 886.2993
## 1019 813.67 764.8789 862.4610 739.0505 888.2895
## 1020 813.67 763.6114 863.7286 737.1120 890.2280
# Use snaive() to forecast the ausbeer series
fcbeer <- snaive(ausbeer, h=16)
# Plot and summarize the forecasts
autoplot(fcbeer)
summary(fcbeer)
##
## Forecast method: Seasonal naive method
##
## Model Information:
## Call: snaive(y = ausbeer, h = 16)
##
## Residual sd: 19.1207
##
## Error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 3.098131 19.32591 15.50935 0.838741 3.69567 1 0.01093868
##
## Forecasts:
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 2010 Q3 419 394.2329 443.7671 381.1219 456.8781
## 2010 Q4 488 463.2329 512.7671 450.1219 525.8781
## 2011 Q1 414 389.2329 438.7671 376.1219 451.8781
## 2011 Q2 374 349.2329 398.7671 336.1219 411.8781
## 2011 Q3 419 383.9740 454.0260 365.4323 472.5677
## 2011 Q4 488 452.9740 523.0260 434.4323 541.5677
## 2012 Q1 414 378.9740 449.0260 360.4323 467.5677
## 2012 Q2 374 338.9740 409.0260 320.4323 427.5677
## 2012 Q3 419 376.1020 461.8980 353.3932 484.6068
## 2012 Q4 488 445.1020 530.8980 422.3932 553.6068
## 2013 Q1 414 371.1020 456.8980 348.3932 479.6068
## 2013 Q2 374 331.1020 416.8980 308.3932 439.6068
## 2013 Q3 419 369.4657 468.5343 343.2438 494.7562
## 2013 Q4 488 438.4657 537.5343 412.2438 563.7562
## 2014 Q1 414 364.4657 463.5343 338.2438 489.7562
## 2014 Q2 374 324.4657 423.5343 298.2438 449.7562
# Check the residuals from the naive forecasts applied to the goog series
goog %>% naive() %>% checkresiduals()
##
## Ljung-Box test
##
## data: Residuals from Naive method
## Q* = 13.123, df = 10, p-value = 0.2169
##
## Model df: 0. Total lags used: 10
# Do they look like white noise (TRUE or FALSE)
googwn <- TRUE
# Check the residuals from the seasonal naive forecasts applied to the ausbeer series
ausbeer %>% snaive() %>% checkresiduals()
##
## Ljung-Box test
##
## data: Residuals from Seasonal naive method
## Q* = 60.535, df = 8, p-value = 3.661e-10
##
## Model df: 0. Total lags used: 8
# Do they look like white noise (TRUE or FALSE)
beerwn <- FALSE
# The pre-loaded time series gold comprises daily gold prices for 1108 days. Here, you'll use the first 1000 days as a training set, and compute forecasts for the remaining 108 days
# Create the training data as train
train <- subset(gold, end = 1000)
# Compute naive forecasts and save to naive_fc
naive_fc <- naive(train, h = 108)
# Compute mean forecasts and save to mean_fc
mean_fc <- meanf(train, h = 108)
# Use accuracy() to compute RMSE statistics
accuracy(naive_fc, gold)
## ME RMSE MAE MPE MAPE MASE
## Training set 0.09161392 6.33977 3.158386 0.01662141 0.794523 1.000000
## Test set -6.53834951 15.84236 13.638350 -1.74622688 3.428789 4.318139
## ACF1 Theil's U
## Training set -0.3098928 NA
## Test set 0.9793153 5.335899
accuracy(mean_fc, gold)
## ME RMSE MAE MPE MAPE MASE
## Training set -4.239671e-15 59.17809 53.63397 -2.390227 14.230224 16.981449
## Test set 1.319363e+01 19.55255 15.66875 3.138577 3.783133 4.960998
## ACF1 Theil's U
## Training set 0.9907254 NA
## Test set 0.9793153 6.123788
# Assign one of the two forecasts as bestforecasts
# bestforecasts <- naive_fc
# Here, you will use the Melbourne quarterly visitor numbers (vn[, "Melbourne"]) to create three different training sets, omitting the last 1, 2 and 3 years, respectively
# Inspect the pre-loaded vn data in your console before beginning the exercise
# This will help you determine the correct value to use for the keyword h (which specifies the number of values you want to forecast) in your forecasting methods
melData <- c(4.865, 4.113, 4.422, 5.171, 5.55, 4.009, 3.986, 3.839, 5.8, 4.229, 4.157, 4.627, 5.691, 4.601, 4.742, 5.733, 5.397, 3.884, 4.996, 5.304, 5.222, 4.765, 4.146, 4.717, 4.88, 4.868, 4.182, 4.214, 5.438, 3.87, 4.394, 4.404, 5.716, 5.291, 4.19, 4.712, 4.709, 4.489, 4.698, 5.193, 5.216, 4.215, 5.042, 5.089, 4.688, 4.393, 4.626, 4.88, 4.844, 4.437, 4.833, 4.622, 5.164, 4.504, 4.976, 4.508, 4.759, 4.835, 5.009, 5.693, 5.224, 4.82, 4.688, 4.918, 5.936, 5.44, 5.134, 5.993, 6.654, 5.342, 5.471, 5.812)
sydData <- c(7.319, 6.13, 6.284, 6.384, 6.602, 5.674, 5.715, 6.564, 6.602, 5.398, 7.172, 8.474, 7.012, 6.388, 6.073, 6.196, 5.633, 5.779, 5.869, 6.002, 6.202, 5.321, 5.161, 5.737, 6.168, 5.709, 5.057, 5.362, 5.902, 4.496, 5.093, 5.253, 6.832, 5.67, 5.008, 5.773, 6.529, 4.911, 4.784, 5.844, 6.252, 5.034, 5.263, 4.714, 5.362, 4.769, 4.125, 5.263, 6, 4.283, 5.256, 5.357, 6.194, 5.102, 5.596, 5.066, 6.684, 4.697, 5.366, 5.075, 5.499, 4.867, 5.71, 6.198, 6.416, 5.284, 5.483, 6.234, 6.938, 6.268, 5.562, 6.016)
vnFrame <- data.frame(Melbourne=melData, Sydney=sydData)
vn <- ts(vnFrame, start=c(1998, 1), frequency=4)
str(vn)
## Time-Series [1:72, 1:2] from 1998 to 2016: 4.87 4.11 4.42 5.17 5.55 ...
## - attr(*, "dimnames")=List of 2
## ..$ : NULL
## ..$ : chr [1:2] "Melbourne" "Sydney"
# Create three training series omitting the last 1, 2, and 3 years
train1 <- window(vn[, "Melbourne"], end = c(2014, 4))
train2 <- window(vn[, "Melbourne"], end = c(2013, 4))
train3 <- window(vn[, "Melbourne"], end = c(2012, 4))
# Produce forecasts using snaive()
fc1 <- snaive(train1, h = 4)
fc2 <- snaive(train2, h = 4)
fc3 <- snaive(train3, h = 4)
# Use accuracy() to compare the MAPE of each series
accuracy(fc1, vn[, "Melbourne"])["Test set", "MAPE"]
## [1] 5.474755
accuracy(fc2, vn[, "Melbourne"])["Test set", "MAPE"]
## [1] 12.50411
accuracy(fc3, vn[, "Melbourne"])["Test set", "MAPE"]
## [1] 7.954534
# Compute cross-validated errors for up to 8 steps ahead.
# Size the error matrix from the series itself rather than hard-coding 1000
# rows, so the code still works if goog has a different length.
e <- matrix(NA_real_, nrow = length(goog), ncol = 8)
for (h in 1:8) {
  e[, h] <- tsCV(goog, forecastfunction = naive, h = h)
}
# Compute the MSE at each horizon, ignoring the NAs that tsCV() pads with
mse <- colMeans(e^2, na.rm = TRUE)
# Plot the MSE values against the forecast horizon
data.frame(h = 1:8, MSE = mse) %>%
  ggplot(aes(x = h, y = MSE)) + geom_point()
Chapter 3 - Exponential smoothing
Exponentially weighted forecasts (simple exponential smoothing):
Exponential smoothing methods with trend:
Exponential smoothing methods with trend and seasonality (commonly known as the Holt-Winters method):
State space models for exponential smoothing:
Example code includes:
# You will also use summary() and fitted(), along with autolayer() for the first time, which is like autoplot() but it adds a "layer" to a plot rather than creating a new plot.
# Here, you will apply these functions to marathon, the annual winning times in the Boston marathon from 1897-2016. The data are available in your workspace.
# Use ses() to forecast the next 10 years of winning times
data(marathon, package="fpp2")
str(marathon)
## Time-Series [1:120] from 1897 to 2016: 175 162 175 160 149 ...
fc <- ses(marathon, h = 10)
# Use summary() to see the model parameters
summary(fc)
##
## Forecast method: Simple exponential smoothing
##
## Model Information:
## Simple exponential smoothing
##
## Call:
## ses(y = marathon, h = 10)
##
## Smoothing parameters:
## alpha = 0.3457
##
## Initial states:
## l = 167.1765
##
## sigma: 5.4728
##
## AIC AICc BIC
## 988.4474 988.6543 996.8099
##
## Error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.8875951 5.472771 3.826287 -0.7098466 2.637645 0.8925669
## ACF1
## Training set -0.01207536
##
## Forecasts:
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 2017 130.3562 123.3425 137.3698 119.6297 141.0826
## 2018 130.3562 122.9353 137.7771 119.0069 141.7054
## 2019 130.3562 122.5492 138.1631 118.4165 142.2958
## 2020 130.3562 122.1814 138.5309 117.8539 142.8584
## 2021 130.3562 121.8294 138.8829 117.3156 143.3967
## 2022 130.3562 121.4914 139.2209 116.7987 143.9136
## 2023 130.3562 121.1658 139.5465 116.3008 144.4116
## 2024 130.3562 120.8514 139.8610 115.8199 144.8925
## 2025 130.3562 120.5470 140.1653 115.3544 145.3580
## 2026 130.3562 120.2518 140.4605 114.9029 145.8094
# Use autoplot() to plot the forecasts
autoplot(fc)
# Add the one-step forecasts for the training data to the plot
autoplot(fc) + autolayer(fitted(fc))
# Create a training set using subset.ts()
train <- subset(marathon, end = length(marathon) - 20)
# Compute SES and naive forecasts, save to fcses and fcnaive
fcses <- ses(train, h = 20)
fcnaive <- naive(train, h = 20)
# Calculate forecast accuracy measures
accuracy(fcses, marathon)
## ME RMSE MAE MPE MAPE MASE
## Training set -1.085512 5.863790 4.155943 -0.8606360 2.827999 0.8990895
## Test set 0.457428 2.493965 1.894228 0.3171688 1.463856 0.4097941
## ACF1 Theil's U
## Training set -0.01587645 NA
## Test set -0.12556096 0.6870722
accuracy(fcnaive, marathon)
## ME RMSE MAE MPE MAPE MASE
## Training set -0.4638047 6.904742 4.622391 -0.4086317 3.123559 1.0000000
## Test set 0.2266667 2.462113 1.846667 0.1388780 1.429608 0.3995047
## ACF1 Theil's U
## Training set -0.3589323 NA
## Test set -0.1255610 0.6799062
# Save the best forecasts as fcbest
# fcbest <- fcnaive
# Here, you will apply it to the austa series, which contains annual counts of international visitors to Australia from 1980-2015 (in millions). The data has been pre-loaded into your workspace.
# Produce 10 year forecasts of austa using holt()
data(austa, package="fpp2")
str(austa)
## Time-Series [1:36] from 1980 to 2015: 0.83 0.86 0.877 0.867 0.932 ...
fcholt <- holt(austa, h=10)
# Look at fitted model using summary()
summary(fcholt)
##
## Forecast method: Holt's method
##
## Model Information:
## Holt's method
##
## Call:
## holt(y = austa, h = 10)
##
## Smoothing parameters:
## alpha = 0.9999
## beta = 1e-04
##
## Initial states:
## l = 0.5684
## b = 0.1755
##
## sigma: 0.1839
##
## AIC AICc BIC
## 17.08684 19.08684 25.00443
##
## Error measures:
## ME RMSE MAE MPE MAPE
## Training set -0.0006980015 0.1839059 0.1628927 -1.231661 6.322328
## MASE ACF1
## Training set 0.7994647 0.234277
##
## Forecasts:
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 2016 7.034379 6.798694 7.270064 6.673930 7.394828
## 2017 7.209838 6.876529 7.543147 6.700086 7.719590
## 2018 7.385297 6.977065 7.793529 6.760960 8.009633
## 2019 7.560756 7.089350 8.032161 6.839804 8.281707
## 2020 7.736214 7.209144 8.263285 6.930129 8.542299
## 2021 7.911673 7.334269 8.489077 7.028610 8.794736
## 2022 8.087132 7.463435 8.710829 7.133269 9.040994
## 2023 8.262591 7.595798 8.929383 7.242820 9.282362
## 2024 8.438049 7.730775 9.145324 7.356366 9.519733
## 2025 8.613508 7.867939 9.359078 7.473258 9.753758
# Plot the forecasts
autoplot(fcholt)
# Check that the residuals look like white noise
checkresiduals(fcholt)
##
## Ljung-Box test
##
## data: Residuals from Holt's method
## Q* = 5.4561, df = 6, p-value = 0.4868
##
## Model df: 4. Total lags used: 10
# Here, you will apply hw() to a10, the monthly sales of anti-diabetic drugs in Australia from 1991 to 2008. The data are available in your workspace.
# Plot the data
data(a10, package="fpp2")
str(a10)
## Time-Series [1:204] from 1992 to 2008: 3.53 3.18 3.25 3.61 3.57 ...
autoplot(a10)
# Produce 3 year forecasts
fc <- hw(a10, seasonal = "multiplicative", h = 3)
# Check if residuals look like white noise
checkresiduals(fc)
##
## Ljung-Box test
##
## data: Residuals from Holt-Winters' multiplicative method
## Q* = 55.57, df = 8, p-value = 3.421e-09
##
## Model df: 16. Total lags used: 24
whitenoise <- FALSE
# Plot forecasts
autoplot(fc)
# Here, you will compare an additive Holt-Winters method and a seasonal naive() method for the hyndsight data, which contains the daily pageviews on the Hyndsight blog for one year starting April 30, 2014
# Create training data with subset()
train <- subset(hyndsight, end = length(hyndsight) - 28)
# Holt-Winters additive forecasts as fchw
fchw <- hw(train, seasonal = "additive", h = 28)
# Seasonal naive forecasts as fcsn
fcsn <- snaive(train, h=28)
# Find better forecasts with accuracy()
accuracy(fchw, hyndsight)
## ME RMSE MAE MPE MAPE MASE
## Training set -0.06894788 230.0922 164.2099 -2.6329101 13.848691 0.7455151
## Test set 41.12766744 180.6719 143.8484 0.9333033 9.254484 0.6530736
## ACF1 Theil's U
## Training set 0.2145322 NA
## Test set 0.2337612 0.4233897
accuracy(fcsn, hyndsight)
## ME RMSE MAE MPE MAPE MASE
## Training set 10.50 310.3282 220.2636 -2.1239387 18.01077 1.0000000
## Test set 0.25 202.7610 160.4643 -0.6888732 10.25880 0.7285101
## ACF1 Theil's U
## Training set 0.4255730 NA
## Test set 0.3089795 0.450266
# Plot the better forecasts
autoplot(fchw)
# Fit ETS model to austa in fitaus
fitaus <- ets(austa)
# Check residuals
checkresiduals(fitaus)
##
## Ljung-Box test
##
## data: Residuals from ETS(A,A,N)
## Q* = 5.4561, df = 6, p-value = 0.4868
##
## Model df: 4. Total lags used: 10
# Plot forecasts
autoplot(forecast(fitaus))
# Repeat for hyndsight data in fiths
fiths <- ets(hyndsight)
checkresiduals(fiths)
##
## Ljung-Box test
##
## data: Residuals from ETS(A,N,A)
## Q* = 65.856, df = 5, p-value = 7.444e-13
##
## Model df: 9. Total lags used: 14
autoplot(forecast(fiths))
# Which model(s) fails test? (TRUE or FALSE)
fitausfail <- FALSE
fithsfail <- TRUE
# Function to return ETS forecasts
# Forecast-function wrapper with the (y, h) signature expected by tsCV():
# fit an ETS model to the series y and return forecasts h steps ahead.
fets <- function(y, h) forecast(ets(y), h = h)
data(qcement, package="fpp2")
str(qcement)
## Time-Series [1:233] from 1956 to 2014: 0.465 0.532 0.561 0.57 0.529 0.604 0.603 0.582 0.554 0.62 ...
cement <- window(qcement, start=1994)
str(cement)
## Time-Series [1:81] from 1994 to 2014: 1.47 1.75 1.96 1.83 1.63 ...
# Apply tsCV() for both methods
e1 <- tsCV(cement, fets, h = 4)
e2 <- tsCV(cement, snaive, h = 4)
# Compute MSE of resulting errors (watch out for missing values)
mean(e1^2, na.rm=TRUE)
## [1] 0.04442133
mean(e2^2, na.rm=TRUE)
## [1] 0.02921384
# Copy the best forecast MSE
bestmse <- mean(e2^2, na.rm=TRUE)
# Fitting an ETS model does not work well for all series
# Here, you will observe why it does not work well for the annual Canadian lynx population available in your workspace as lynx
# Plot the lynx series
data(lynx)
str(lynx)
## Time-Series [1:114] from 1821 to 1934: 269 321 585 871 1475 ...
autoplot(lynx)
# Use ets() to model the lynx series
fit <- ets(lynx)
# Use summary() to look at model and parameters
summary(fit)
## ETS(M,N,N)
##
## Call:
## ets(y = lynx)
##
## Smoothing parameters:
## alpha = 0.9999
##
## Initial states:
## l = 169.2223
##
## sigma: 0.9489
##
## AIC AICc BIC
## 2052.369 2052.587 2060.578
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 28.30726 1182.181 824.4878 -44.94341 94.83425 0.9923325
## ACF1
## Training set 0.3785704
# Plot 20-year forecasts of the lynx series
fit %>% forecast(h=20) %>% autoplot()
Chapter 4 - Forecasting with ARIMA Models
Transformations for variance stabilization:
ARIMA models - AutoRegressive Integrated Moving Average:
Seasonal ARIMA models - just needs a lot more differencing and lags:
Example code includes:
# Plot the series
autoplot(a10)
# Try four values of lambda in Box-Cox transformations
a10 %>% BoxCox(lambda = 0.0) %>% autoplot()
a10 %>% BoxCox(lambda = 0.1) %>% autoplot()
a10 %>% BoxCox(lambda = 0.2) %>% autoplot()
a10 %>% BoxCox(lambda = 0.3) %>% autoplot()
# Compare with BoxCox.lambda()
BoxCox.lambda(a10)
## [1] 0.1313326
# In this exercise, you will use the pre-loaded wmurders data, which contains the annual female murder rate in the US from 1950-2004
data(wmurders, package="fpp2")
# Plot the US female murder rate
autoplot(wmurders)
# Plot the differenced murder rate
autoplot(diff(wmurders))
# Plot the ACF of the differenced murder rate
ggAcf(diff(wmurders))
# In this exercise, you will use differencing and transformations simultaneously to make a time series look stationary. The data set here is h02, which contains 17 years of monthly corticosteroid drug sales in Australia
data(h02, package="fpp2")
str(h02)
## Time-Series [1:204] from 1992 to 2008: 0.43 0.401 0.432 0.493 0.502 ...
# Plot the data
autoplot(h02)
# Take logs and seasonal differences of h02
difflogh02 <- diff(log(h02), lag = 12)
# Plot difflogh02
autoplot(difflogh02)
# Take another difference and plot
ddifflogh02 <- diff(difflogh02)
autoplot(ddifflogh02)
# Plot ACF of ddifflogh02
ggAcf(ddifflogh02)
# Fit an automatic ARIMA model to the austa series
fit <- auto.arima(austa)
# Check that the residuals look like white noise
checkresiduals(fit)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,1,1) with drift
## Q* = 3.2552, df = 8, p-value = 0.9173
##
## Model df: 2. Total lags used: 10
residualsok <- TRUE
# Summarize the model
summary(fit)
## Series: austa
## ARIMA(0,1,1) with drift
##
## Coefficients:
## ma1 drift
## 0.3006 0.1735
## s.e. 0.1647 0.0390
##
## sigma^2 estimated as 0.03376: log likelihood=10.62
## AIC=-15.24 AICc=-14.46 BIC=-10.57
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.0008313383 0.1759116 0.1520309 -1.069983 5.513269 0.7461559
## ACF1
## Training set -0.000571993
# Find the AICc value and the number of differences used
AICc <- round(fit$aicc, 2)
d <- 1
# Plot forecasts of fit
fit %>% forecast(h = 10) %>% autoplot()
# Plot forecasts from an ARIMA(0,1,1) model with no drift
austa %>% Arima(order = c(0, 1, 1), include.constant = FALSE) %>% forecast() %>% autoplot()
# Plot forecasts from an ARIMA(2,1,3) model with drift
austa %>% Arima(order = c(2, 1, 3), include.constant = TRUE) %>% forecast() %>% autoplot()
# Plot forecasts from an ARIMA(0,0,1) model with a constant
austa %>% Arima(order = c(0, 0, 1), include.constant = TRUE) %>% forecast() %>% autoplot()
# Plot forecasts from an ARIMA(0,2,1) model with no constant
austa %>% Arima(order = c(0, 2, 1), include.constant = FALSE) %>% forecast() %>% autoplot()
# Set up forecast functions for ETS and ARIMA models
# Wrapper with the (x, h) signature required by tsCV():
# ETS model fitted to x, forecast h steps ahead.
fets <- function(x, h) forecast(ets(x), h = h)
# Wrapper with the (x, h) signature required by tsCV():
# automatically selected ARIMA model, forecast h steps ahead.
farima <- function(x, h) forecast(auto.arima(x), h = h)
# Compute CV errors for ETS as e1
e1 <- tsCV(austa, fets, h = 1)
# Compute CV errors for ARIMA as e2
e2 <- tsCV(austa, farima, h = 1)
# Find MSE of each model class. Use R's documented exponentiation operator
# `^` rather than the undocumented `**` synonym.
mean(e1^2, na.rm = TRUE)
## [1] 0.05684574
mean(e2^2, na.rm = TRUE)
## [1] 0.04336277
# Plot 10-year forecasts using the best (lower-MSE) model class: ARIMA
austa %>% farima(h = 10) %>% autoplot()
# Check that the logged h02 data have stable variance
h02 %>% log() %>% autoplot()
# Fit a seasonal ARIMA model to h02 with lambda = 0
fit <- auto.arima(h02, lambda=0)
# Summarize the fitted model
summary(fit)
## Series: h02
## ARIMA(2,1,3)(0,1,1)[12]
## Box Cox transformation: lambda= 0
##
## Coefficients:
## ar1 ar2 ma1 ma2 ma3 sma1
## -1.0194 -0.8351 0.1717 0.2578 -0.4206 -0.6528
## s.e. 0.1648 0.1203 0.2079 0.1177 0.1060 0.0657
##
## sigma^2 estimated as 0.004203: log likelihood=250.8
## AIC=-487.6 AICc=-486.99 BIC=-464.83
##
## Training set error measures:
## ME RMSE MAE MPE MAPE
## Training set -0.003823286 0.05006017 0.03588577 -0.643286 4.52991
## MASE ACF1
## Training set 0.5919957 -0.007519928
# Record the amount of lag-1 differencing and seasonal differencing used
d <- 1
D <- 1
# Plot 2-year forecasts
fit %>% forecast(h=24) %>% autoplot()
# Find an ARIMA model for euretail
data(euretail, package="fpp2")
str(euretail)
## Time-Series [1:64] from 1996 to 2012: 89.1 89.5 89.9 90.1 89.2 ...
fit1 <- auto.arima(euretail)
# Don't use a stepwise search
fit2 <- auto.arima(euretail, stepwise=FALSE)
# AICc of better model
AICc <- round(min(fit1$aicc, fit2$aicc), 2)
# Compute 2-year forecasts from better model
fit2 %>% forecast(h=8) %>% autoplot()
# In the final exercise for this chapter, you will compare seasonal ARIMA and ETS models applied to the quarterly cement production data qcement
# Because the series is very long, you can afford to use a training and test set rather than time series cross-validation. This is much faster
# Use 20 years of the qcement data beginning in 1988
train <- window(qcement, start = c(1988, 1), end = c(2007, 4))
# Fit an ARIMA and an ETS model to the training data
fit1 <- auto.arima(train)
fit2 <- ets(train)
# Check that both models have white noise residuals
checkresiduals(fit1)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(2,0,0)(2,1,1)[4] with drift
## Q* = 3.5497, df = 3, p-value = 0.3144
##
## Model df: 6. Total lags used: 9
checkresiduals(fit2)
##
## Ljung-Box test
##
## data: Residuals from ETS(M,N,M)
## Q* = 7.1565, df = 3, p-value = 0.06707
##
## Model df: 6. Total lags used: 9
# Produce forecasts for each model
fc1 <- forecast(fit1, h = length(window(qcement, start=2008)))
fc2 <- forecast(fit2, h = length(window(qcement, start=2008)))
# Use accuracy() to find better model based on RMSE
accuracy(fc1, qcement)
## ME RMSE MAE MPE MAPE
## Training set -0.005847264 0.1005547 0.07954692 -0.6550065 4.347836
## Test set -0.158355693 0.1978197 0.16749405 -7.3004639 7.653446
## MASE ACF1 Theil's U
## Training set 0.5434705 -0.01315073 NA
## Test set 1.1443319 0.28848376 0.7226779
accuracy(fc2, qcement)
## ME RMSE MAE MPE MAPE
## Training set 0.01497655 0.1017825 0.07904747 0.5666991 4.332964
## Test set -0.13546124 0.1836477 0.15332875 -6.2632105 6.954592
## MASE ACF1 Theil's U
## Training set 0.5400583 -0.03949197 NA
## Test set 1.0475535 0.54271907 0.6810648
# bettermodel <- fc2
Chapter 5 - Advanced Methods
Dynamic Regression - could include factors like advertising or competition in a single model:
Dynamic Harmonic Regression - handling periodic seasonality with Fourier terms:
args(tslm)
TBATS models - combines many models into a single model:
Wrap up:
Example code includes:
# In this exercise, you will model sales data regressed against advertising expenditure, with an ARMA error to account for any serial correlation in the regression errors
# The data are available in your workspace as advert and comprise 24 months of sales and advertising expenditure for an automotive parts company
data(advert, package="fma")
str(advert)
## Time-Series [1:24, 1:2] from 1 to 24: 25 0 15 10 20 10 5 5 15 15 ...
## - attr(*, "dimnames")=List of 2
## ..$ : NULL
## ..$ : chr [1:2] "advert" "sales"
# Time plot of both variables
autoplot(advert, facets=TRUE)
# Fit ARIMA model
fit <- auto.arima(advert[, "sales"], xreg = advert[, "advert"], stationary = TRUE)
# Check model. Increase in sales for each unit increase in advertising
salesincrease <- coefficients(fit)[3]
# Forecast fit as fc
fc <- forecast(fit, xreg = rep(10, 6))
# Plot fc with x and y labels
autoplot(fc) + xlab("Month") + ylab("Sales")
data(elecdaily, package="fpp2")
str(elecdaily)
## Time-Series [1:365, 1:3] from 1 to 53: 175 189 189 174 170 ...
## - attr(*, "dimnames")=List of 2
## ..$ : NULL
## ..$ : chr [1:3] "Demand" "WorkDay" "Temperature"
elec <- elecdaily
colnames(elec)[2] <- "Workday"
# Time plots of demand and temperatures
autoplot(elec[, c("Demand", "Temperature")], facets = TRUE)
# Matrix of regressors: maximum temperature, its square (demand responds
# non-linearly to temperature), and the working-day indicator.
# Use `^` for exponentiation rather than the undocumented `**` synonym.
xreg <- cbind(MaxTemp = elec[, "Temperature"],
              MaxTempSq = elec[, "Temperature"]^2,
              Workday = elec[, "Workday"])
# Fit a dynamic regression model with ARIMA errors
fit <- auto.arima(elec[, "Demand"], xreg = xreg)
# Forecast one day ahead: a 20-degree maximum temperature on a working day
forecast(fit, xreg = cbind(20, 20^2, 1))
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 53.14286 185.4008 176.9271 193.8745 172.4414 198.3602
# The pre-loaded gasoline data comprises weekly data on US finished motor gasoline products
# In this exercise, you will fit a harmonic regression to this data set and forecast the next 3 years
data(gasoline, package="fpp2")
str(gasoline)
## Time-Series [1:1355] from 1991 to 2017: 6621 6433 6582 7224 6875 ...
# Set up harmonic regressors of order 13
harmonics <- fourier(gasoline, K = 13)
# Fit regression model with ARIMA errors
fit <- auto.arima(gasoline, xreg = harmonics, seasonal = FALSE)
# Forecasts next 3 years
newharmonics <- fourier(gasoline, K = 13, h = 156)
fc <- forecast(fit, xreg = newharmonics)
# Plot forecasts fc
autoplot(fc)
# Fit a harmonic regression using order 10 for each type of seasonality
fit <- tslm(taylor ~ fourier(taylor, K = c(10, 10)))
# Forecast 20 working days ahead
fc <- forecast(fit, newdata = data.frame(fourier(taylor, K = c(10, 10), h = 20 * 48)))
# Plot the forecasts
autoplot(fc)
# Check the residuals of fit
checkresiduals(fit)
##
## Breusch-Godfrey test for serial correlation of order up to 672
##
## data: Residuals from Linear regression model
## LM test = 3938.9, df = 672, p-value < 2.2e-16
# Another time series with multiple seasonal periods is calls, which contains 20 consecutive days of 5-minute call volume data for a large North American bank
# There are 169 5-minute periods in a working day, and so the weekly seasonal frequency is 5 x 169 = 845
# The weekly seasonality is relatively weak, so here you will just model daily seasonality. calls is pre-loaded into your workspace
# The residuals in this case still fail the white noise tests, but their autocorrelations are tiny, even though they are significant
# This is because the series is so long. It is often unrealistic to have residuals that pass the tests for such long series
# The effect of the remaining correlations on the forecasts will be negligible
data(calls, package="fpp2")
calls <- window(calls, start=29.8)
str(calls)
## Time-Series [1:3380] from 29.8 to 33.8: 98 83 89 87 71 85 76 81 86 94 ...
## - attr(*, "names")= chr [1:3380] "X26.09.20031" "X26.09.20032" "X26.09.20033" "X26.09.20034" ...
## - attr(*, "msts")= num [1:2] 169 845
# Plot the calls data
autoplot(calls)
# Set up the xreg matrix
xreg <- fourier(calls, K = c(10, 0))
# Fit a dynamic regression model
fit <- auto.arima(calls, xreg = xreg, seasonal=FALSE, stationary=TRUE)
# Check the residuals
checkresiduals(fit)
##
## Ljung-Box test
##
## data: Residuals from Regression with ARIMA(3,0,1) errors
## Q* = 1843.9, df = 1665, p-value = 0.001318
##
## Model df: 25. Total lags used: 1690
# Plot forecasts for 10 working days ahead
fc <- forecast(fit, xreg = fourier(calls, c(10, 0), h = 10 * 169))
autoplot(fc)
# The gas data contains Australian monthly gas production
# A plot of the data shows the variance has changed a lot over time, so it needs a transformation
# The seasonality has also changed shape over time, and there is a strong trend
# This makes it an ideal series to test the tbats() function which is designed to handle these features
# Plot the gas data
autoplot(gas)
# Fit a TBATS model to the gas data
fit <- tbats(gas)
# Forecast the series for the next 5 years
fc <- forecast(fit, h=60)
# Plot the forecasts
autoplot(fc)
# Record the Box-Cox parameter and the order of the Fourier terms
lambda <- round(as.vector(fc$model$lambda), 3) # 0.082
K <- fc$model$k.vector #5
Chapter 1 - Introduction to Networks
What are social networks?
Network attributes - may want to add information about vertices and edges:
Network visualization principles - many options for creating and customizing the display:
Example code includes:
# Load igraph
library(igraph)
##
## Attaching package: 'igraph'
## The following object is masked from 'package:tidyr':
##
## crossing
## The following objects are masked from 'package:lubridate':
##
## %--%, union
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
friends <- readr::read_csv("./RInputFiles/friends.csv")
## Parsed with column specification:
## cols(
## name1 = col_character(),
## name2 = col_character()
## )
str(friends)
## Classes 'tbl_df', 'tbl' and 'data.frame': 27 obs. of 2 variables:
## $ name1: chr "Jessie" "Jessie" "Sidney" "Sidney" ...
## $ name2: chr "Sidney" "Britt" "Britt" "Donnie" ...
## - attr(*, "spec")=List of 2
## ..$ cols :List of 2
## .. ..$ name1: list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ name2: list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## ..$ default: list()
## .. ..- attr(*, "class")= chr "collector_guess" "collector"
## ..- attr(*, "class")= chr "col_spec"
# Inspect the first few rows of the dataframe 'friends'
head(friends)
## # A tibble: 6 x 2
## name1 name2
## <chr> <chr>
## 1 Jessie Sidney
## 2 Jessie Britt
## 3 Sidney Britt
## 4 Sidney Donnie
## 5 Karl Berry
## 6 Sidney Rene
# Convert friends dataframe to a matrix
friends.mat <- as.matrix(friends)
# Convert friends matrix to an igraph object
g <- graph.edgelist(friends.mat, directed = FALSE)
# Make a very basic plot of the network
plot(g)
# Subset vertices and edges
V(g)
## + 16/16 vertices, named, from ebe8820:
## [1] Jessie Sidney Britt Donnie Karl Berry Rene Shayne
## [9] Elisha Whitney Odell Lacy Eugene Jude Rickie Tommy
E(g)
## + 27/27 edges from ebe8820 (vertex names):
## [1] Jessie --Sidney Jessie --Britt Sidney --Britt Sidney --Donnie
## [5] Karl --Berry Sidney --Rene Britt --Rene Sidney --Shayne
## [9] Sidney --Elisha Sidney --Whitney Jessie --Whitney Donnie --Odell
## [13] Sidney --Odell Rene --Whitney Donnie --Shayne Jessie --Lacy
## [17] Rene --Lacy Elisha --Eugene Eugene --Jude Berry --Odell
## [21] Odell --Rickie Karl --Odell Britt --Lacy Elisha --Jude
## [25] Whitney--Lacy Britt --Whitney Karl --Tommy
# Count number of edges
gsize(g)
## [1] 27
# Count number of vertices
gorder(g)
## [1] 16
# Inspect the objects 'genders' and 'ages'
genders <- c('M', 'F', 'F', 'M', 'M', 'M', 'F', 'M', 'M', 'F', 'M', 'F', 'M', 'F', 'M', 'M')
ages <- c(18, 19, 21, 20, 22, 18, 23, 21, 22, 20, 20, 22, 21, 18, 19, 20)
# Create new vertex attribute called 'gender'
g <- set_vertex_attr(g, "gender", value = genders)
# Create new vertex attribute called 'age'
g <- set_vertex_attr(g, "age", value = ages)
# View all vertex attributes in a list
vertex_attr(g)
## $name
## [1] "Jessie" "Sidney" "Britt" "Donnie" "Karl" "Berry" "Rene"
## [8] "Shayne" "Elisha" "Whitney" "Odell" "Lacy" "Eugene" "Jude"
## [15] "Rickie" "Tommy"
##
## $gender
## [1] "M" "F" "F" "M" "M" "M" "F" "M" "M" "F" "M" "F" "M" "F" "M" "M"
##
## $age
## [1] 18 19 21 20 22 18 23 21 22 20 20 22 21 18 19 20
# View attributes of first five vertices in a dataframe
V(g)[[1:5]]
## + 5/16 vertices, named, from ebe8820:
## name gender age
## 1 Jessie M 18
## 2 Sidney F 19
## 3 Britt F 21
## 4 Donnie M 20
## 5 Karl M 22
# View hours
hours <- c(1, 2, 2, 1, 2, 5, 5, 1, 1, 3, 2, 1, 1, 5, 1, 2, 4, 1, 3, 1, 1, 1, 4, 1, 3, 3, 4)
# Create new edge attribute called 'hours'
g <- set_edge_attr(g, "hours", value = hours)
# View edge attributes of graph object
edge_attr(g)
## $hours
## [1] 1 2 2 1 2 5 5 1 1 3 2 1 1 5 1 2 4 1 3 1 1 1 4 1 3 3 4
# Find all edges that include "Britt"
E(g)[[inc('Britt')]]
## + 5/27 edges from ebe8820 (vertex names):
## tail head tid hid hours
## 2 Jessie Britt 1 3 2
## 3 Sidney Britt 2 3 2
## 7 Britt Rene 3 7 5
## 23 Britt Lacy 3 12 4
## 26 Britt Whitney 3 10 3
# Find all pairs that spend 4 or more hours together per week
E(g)[[hours>=4]]
## + 6/27 edges from ebe8820 (vertex names):
## tail head tid hid hours
## 6 Sidney Rene 2 7 5
## 7 Britt Rene 3 7 5
## 14 Rene Whitney 7 10 5
## 17 Rene Lacy 7 12 4
## 23 Britt Lacy 3 12 4
## 27 Karl Tommy 5 16 4
friends1_nodes <- readr::read_csv("./RInputFiles/friends1_nodes.csv")
## Parsed with column specification:
## cols(
## name = col_character(),
## gender = col_character()
## )
str(friends1_nodes)
## Classes 'tbl_df', 'tbl' and 'data.frame': 19 obs. of 2 variables:
## $ name : chr "Joe" "Erin" "Kelley" "Ronald" ...
## $ gender: chr "M" "F" "F" "M" ...
## - attr(*, "spec")=List of 2
## ..$ cols :List of 2
## .. ..$ name : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ gender: list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## ..$ default: list()
## .. ..- attr(*, "class")= chr "collector_guess" "collector"
## ..- attr(*, "class")= chr "col_spec"
friends1_edges <- readr::read_csv("./RInputFiles/friends1_edges.csv")
## Parsed with column specification:
## cols(
## name1 = col_character(),
## name2 = col_character(),
## hours = col_integer()
## )
str(friends1_edges)
## Classes 'tbl_df', 'tbl' and 'data.frame': 25 obs. of 3 variables:
## $ name1: chr "Joe" "Joe" "Joe" "Erin" ...
## $ name2: chr "Ronald" "Michael" "Troy" "Kelley" ...
## $ hours: int 1 3 2 3 5 1 3 5 2 1 ...
## - attr(*, "spec")=List of 2
## ..$ cols :List of 3
## .. ..$ name1: list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ name2: list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ hours: list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## ..$ default: list()
## .. ..- attr(*, "class")= chr "collector_guess" "collector"
## ..- attr(*, "class")= chr "col_spec"
# Create an igraph object with attributes directly from dataframes
g1 <- graph_from_data_frame(d = friends1_edges, vertices = friends1_nodes, directed = FALSE)
# Subset edges greater than or equal to 5 hours
E(g1)[[hours >= 5]]
## + 4/25 edges from ec061c9 (vertex names):
## tail head tid hid hours
## 5 Kelley Valentine 3 6 5
## 8 Ronald Jasmine 4 8 5
## 12 Valentine Perry 6 15 5
## 15 Jasmine Juan 8 9 6
# Plot network and color vertices by gender
V(g1)$color <- ifelse(V(g1)$gender == "F", "orange", "dodgerblue")
plot(g1, vertex.label.color = "black")
# Plot the graph object g1 in a circle layout
plot(g1, vertex.label.color = "black", layout = layout_in_circle(g1))
# Plot the graph object g1 in a Fruchterman-Reingold layout
plot(g1, vertex.label.color = "black", layout = layout_with_fr(g1))
# Plot the graph object g1 in a Tree layout
m <- layout_as_tree(g1)
plot(g1, vertex.label.color = "black", layout = m)
# Plot the graph object g1 using igraph's chosen layout
m1 <- layout_nicely(g1)
plot(g1, vertex.label.color = "black", layout = m1)
# Create a vector of weights based on the number of hours each pair spend together
w1 <- E(g1)$hours
# Plot the network varying edges by weights
m1 <- layout_nicely(g1)
plot(g1,
vertex.label.color = "black",
edge.color = 'black',
edge.width = w1,
layout = m1)
# Create a new igraph object only including edges from the original graph that are greater than 2 hours long
g2 <- delete_edges(g1, E(g1)[hours < 2])
# Plot the new graph
w2 <- E(g2)$hours
m2 <- layout_nicely(g2)
plot(g2,
vertex.label.color = "black",
edge.color = 'black',
edge.width = w2,
layout = m2)
Chapter 2 - Identifying Important Vertices in a Network
Directed Networks - arrows represent the from-to relationship, such as e-mail exchanges:
Relationship between Vertices - overall patterns between networks (neighbors and paths):
Significant nodes in a network:
Example code includes:
measles <- readr::read_csv("./RInputFiles/measles.csv")
## Parsed with column specification:
## cols(
## from = col_integer(),
## to = col_integer()
## )
str(measles)
## Classes 'tbl_df', 'tbl' and 'data.frame': 184 obs. of 2 variables:
## $ from: int 45 45 172 180 45 180 42 45 182 45 ...
## $ to : int 1 2 3 4 5 6 7 8 9 10 ...
## - attr(*, "spec")=List of 2
## ..$ cols :List of 2
## .. ..$ from: list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ to : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## ..$ default: list()
## .. ..- attr(*, "class")= chr "collector_guess" "collector"
## ..- attr(*, "class")= chr "col_spec"
# Get the graph object
g <- graph_from_data_frame(measles, directed = TRUE)
# Is the graph directed?
is.directed(g)
## [1] TRUE
# Is the graph weighted?
is.weighted(g)
## [1] FALSE
# Where does each edge originate from?
table(head_of(g, E(g)))
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 17 18 19
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 56 57 58 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 184 185 186 187
## 1 1 1 1
# Make a basic plot
plot(g,
vertex.label.color = "black",
edge.color = 'gray77',
vertex.size = 0,
edge.arrow.size = 0.1,
layout = layout_nicely(g))
# Is there an edge going from vertex 184 to vertex 178?
g['184', '178']
## [1] 1
# Is there an edge going from vertex 178 to vertex 184?
g['178', '184']
## [1] 0
# Show all edges going to or from vertex 184
incident(g, '184', mode = c("all"))
## + 6/184 edges from ecb34f1 (vertex names):
## [1] 184->45 184->182 184->181 184->178 184->183 184->177
# Show all edges going out from vertex 184
incident(g, '184', mode = c("out"))
## + 6/184 edges from ecb34f1 (vertex names):
## [1] 184->45 184->182 184->181 184->178 184->183 184->177
# Identify all neighbors of vertex 12 regardless of direction
neighbors(g, '12', mode = c('all'))
## + 5/187 vertices, named, from ecb34f1:
## [1] 45 13 72 89 109
# Identify other vertices that direct edges towards vertex 12
neighbors(g, '12', mode = c('in'))
## + 1/187 vertex, named, from ecb34f1:
## [1] 45
# Identify any vertices that receive an edge from vertex 42 and direct an edge to vertex 124
n1 <- neighbors(g, '42', mode = c('out'))
n2 <- neighbors(g, '124', mode = c('in'))
intersection(n1, n2)
## + 1/187 vertex, named, from ecb34f1:
## [1] 7
# Which two vertices are the furthest apart in the graph ?
farthest_vertices(g)
## $vertices
## + 2/187 vertices, named, from ecb34f1:
## [1] 184 162
##
## $distance
## [1] 5
# Shows the path sequence between two furthest apart vertices.
get_diameter(g)
## + 6/187 vertices, named, from ecb34f1:
## [1] 184 178 42 7 123 162
# Identify vertices that are reachable within two connections from vertex 42
ego(g, 2, '42', mode = c('out'))
## [[1]]
## + 13/187 vertices, named, from ecb34f1:
## [1] 42 7 106 43 123 101 120 124 125 128 129 108 127
# Identify vertices that can reach vertex 42 within two connections
ego(g, 2, '42', mode = c('in'))
## [[1]]
## + 3/187 vertices, named, from ecb34f1:
## [1] 42 178 184
# Calculate the out-degree of each vertex
g.outd <- degree(g, mode = c("out"))
# View a summary of out-degree
table(g.outd)
## g.outd
## 0 1 2 3 4 6 7 8 30
## 125 21 16 12 6 2 3 1 1
# Make a histogram of out-degrees
hist(g.outd, breaks = 30)
# Find the vertex that has the maximum out-degree
which.max(g.outd)
## 45
## 1
# Calculate betweenness of each vertex
g.b <- betweenness(g, directed = TRUE)
# Show histogram of vertex betweenness
hist(g.b, breaks = 80)
# Create plot with vertex size determined by betweenness score
plot(g,
vertex.label = NA,
edge.color = 'black',
vertex.size = sqrt(g.b)+1,
edge.arrow.size = 0.05,
layout = layout_nicely(g))
# Make an ego graph
g184 <- make_ego_graph(g, diameter(g), nodes = '184', mode = c("all"))[[1]]
# Get a vector of geodesic distances of all vertices from vertex 184
dists <- distances(g184, "184")
# Create a color palette of length equal to the maximal geodesic distance plus one.
colors <- c("black", "red", "orange", "blue", "dodgerblue", "cyan")
# Set color attribute to vertices of network g184.
V(g184)$color <- colors[dists+1]
# Visualize the network based on geodesic distance from vertex 184 (patient zero).
plot(g184,
vertex.label = dists,
vertex.label.color = "white",
vertex.label.cex = .6,
edge.color = 'black',
vertex.size = 7,
edge.arrow.size = .05,
main = "Geodesic Distances from Patient Zero"
)
Chapter 3 - Characterizing Network Structures
Introduction - Forrest Gump network dataset analysis:
Understanding network structures:
Network Substructures:
Example code includes:
# In this chapter you will use a social network based on the movie Forrest Gump
# Each edge of the network indicates that those two characters were in at least one scene of the movie together
# Therefore this network is undirected
# To familiarize yourself with the network, you will first create the network object from the raw dataset
# Then, you will identify key vertices using a measure called eigenvector centrality
# Individuals with high eigenvector centrality are those that are highly connected to other highly connected individuals
# You will then make an exploratory visualization of the network
gump <- readr::read_csv("./RInputFiles/gump.csv")
## Parsed with column specification:
## cols(
## V1 = col_character(),
## V2 = col_character()
## )
str(gump)
## Classes 'tbl_df', 'tbl' and 'data.frame': 271 obs. of 2 variables:
## $ V1: chr "ABBIE HOFFMAN" "ABBIE HOFFMAN" "ANCHORMAN" "ANCHORMAN" ...
## $ V2: chr "JENNY" "POLICEMAN" "FORREST" "LT DAN" ...
## - attr(*, "spec")=List of 2
## ..$ cols :List of 2
## .. ..$ V1: list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ V2: list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## ..$ default: list()
## .. ..- attr(*, "class")= chr "collector_guess" "collector"
## ..- attr(*, "class")= chr "col_spec"
# Inspect Forrest Gump Movie dataset
head(gump)
## # A tibble: 6 x 2
## V1 V2
## <chr> <chr>
## 1 ABBIE HOFFMAN JENNY
## 2 ABBIE HOFFMAN POLICEMAN
## 3 ANCHORMAN FORREST
## 4 ANCHORMAN LT DAN
## 5 ANCHORMAN MARGO
## 6 ANCHORMAN MRS GUMP
# Make an undirected network
g <- graph_from_data_frame(gump, directed = FALSE)
# Identify key nodes using eigenvector centrality
g.ec <- eigen_centrality(g)
which.max(g.ec$vector)
## FORREST
## 36
# Plot Forrest Gump Network
plot(g, vertex.label.color = "black", vertex.label.cex = 0.6, vertex.size = 25*(g.ec$vector),
edge.color = 'gray88', main = "Forrest Gump Network"
)
# Get density of a graph
gd <- edge_density(g)
# Get the diameter of the graph g
diameter(g, directed = FALSE)
## [1] 4
# Get the average path length of the graph g
g.apl <- mean_distance(g, directed = FALSE)
g.apl
## [1] 1.994967
# Create one random graph with the same number of nodes and edges as g
g.random <- erdos.renyi.game(n = gorder(g), p.or.m = edge_density(g), type = "gnp")
g.random
## IGRAPH edb5f5c U--- 94 279 -- Erdos renyi (gnp) graph
## + attr: name (g/c), type (g/c), loops (g/l), p (g/n)
## + edges from edb5f5c:
## [1] 1-- 8 4-- 9 5--10 6--10 7--11 3--13 12--13 2--14 4--14 5--14
## [11] 6--14 11--14 8--15 4--16 4--17 2--18 1--20 6--21 2--22 13--22
## [21] 17--22 15--24 5--25 8--25 9--25 25--26 5--27 16--27 3--28 8--29
## [31] 27--29 11--30 16--30 17--30 20--30 22--30 23--30 12--31 15--31 19--31
## [41] 26--31 6--32 17--32 21--32 11--33 4--34 7--34 26--34 1--35 8--36
## [51] 10--36 26--36 3--37 14--37 19--38 32--38 37--38 2--39 2--40 7--40
## [61] 33--40 6--41 10--42 18--42 24--42 30--42 16--43 24--43 31--43 8--44
## [71] 34--44 34--45 40--45 5--47 27--47 42--47 35--48 45--48 2--49 2--50
## + ... omitted several edges
plot(g.random)
# Get density of new random graph `g.random`
edge_density(g.random)
## [1] 0.06382979
#Get the average path length of the random graph g.random
mean_distance(g.random, directed = FALSE)
## [1] 2.702197
# Observed average path length of the (undirected) Forrest Gump network,
# re-established here before comparing against the simulated random graphs.
# (directed is ignored for undirected graphs; stated explicitly to match L7163.)
g.apl <- mean_distance(g, directed = FALSE)
# Generate 1000 random graphs
# Monte Carlo null model: each random graph has the same number of vertices
# as g and the same density gd, so differences reflect structure, not size.
gl <- vector('list', 1000)
for(i in 1:1000){
gl[[i]] <- erdos.renyi.game(n = gorder(g), p.or.m = gd, type = "gnp")
}
# Calculate average path length of 1000 random graphs
gl.apl <- lapply(gl, FUN=mean_distance, directed = FALSE)
gl.apls <- unlist(gl.apl)
# Plot the distribution of average path lengths
hist(gl.apls, xlim = range(c(1.5, 6)))
# Red dashed line marks the observed value for the Forrest Gump network
abline(v = g.apl, col = "red", lty = 3, lwd=2)
# Calculate the proportion of graphs with an average path length lower than our observed
# 0 => the observed network has a shorter average path length than every simulated graph
sum(gl.apls < g.apl)/1000
## [1] 0
# Show all triangles in the network.
matrix(triangles(g), nrow = 3)
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13]
## [1,] 36 36 36 36 36 36 36 36 36 36 36 36 36
## [2,] 1 1 1 1 2 4 4 6 6 6 6 7 7
## [3,] 83 38 39 66 68 57 24 27 75 40 45 8 69
## [,14] [,15] [,16] [,17] [,18] [,19] [,20] [,21] [,22] [,23] [,24]
## [1,] 36 36 36 36 36 36 36 36 36 36 36
## [2,] 8 11 11 11 12 12 13 14 14 14 14
## [3,] 69 12 13 70 70 13 70 4 19 24 71
## [,25] [,26] [,27] [,28] [,29] [,30] [,31] [,32] [,33] [,34] [,35]
## [1,] 36 36 36 36 36 36 36 36 36 36 36
## [2,] 14 14 14 14 14 15 15 17 17 18 18
## [3,] 65 57 62 63 64 21 72 22 42 5 28
## [,36] [,37] [,38] [,39] [,40] [,41] [,42] [,43] [,44] [,45] [,46]
## [1,] 36 36 36 36 36 36 36 36 36 36 36
## [2,] 19 19 21 22 24 26 26 26 26 26 26
## [3,] 71 63 72 42 57 73 52 47 48 49 50
## [,47] [,48] [,49] [,50] [,51] [,52] [,53] [,54] [,55] [,56] [,57]
## [1,] 36 36 36 36 36 36 36 36 36 36 36
## [2,] 27 27 27 28 28 30 30 30 34 38 38
## [3,] 75 45 40 5 90 84 61 51 88 83 66
## [,58] [,59] [,60] [,61] [,62] [,63] [,64] [,65] [,66] [,67] [,68]
## [1,] 36 36 36 36 36 36 36 36 36 36 36
## [2,] 38 39 39 40 40 41 41 41 41 41 41
## [3,] 39 83 66 75 45 1 3 6 7 8 11
## [,69] [,70] [,71] [,72] [,73] [,74] [,75] [,76] [,77] [,78] [,79]
## [1,] 36 36 36 36 36 36 36 36 36 36 36
## [2,] 41 41 41 41 41 41 41 41 41 41 41
## [3,] 12 13 26 27 30 32 33 86 37 38 39
## [,80] [,81] [,82] [,83] [,84] [,85] [,86] [,87] [,88] [,89] [,90]
## [1,] 36 36 36 36 36 36 36 36 36 36 36
## [2,] 41 41 41 41 41 41 41 41 41 41 41
## [3,] 40 43 44 45 47 48 49 50 51 52 53
## [,91] [,92] [,93] [,94] [,95] [,96] [,97] [,98] [,99] [,100] [,101]
## [1,] 36 36 36 36 36 36 36 36 36 36 36
## [2,] 41 41 41 41 41 41 41 41 41 41 41
## [3,] 54 56 58 61 66 69 70 73 74 75 79
## [,102] [,103] [,104] [,105] [,106] [,107] [,108] [,109] [,110] [,111]
## [1,] 36 36 36 36 36 36 36 36 36 36
## [2,] 41 41 41 43 43 43 44 44 44 44
## [3,] 82 83 84 82 54 53 2 3 9 14
## [,112] [,113] [,114] [,115] [,116] [,117] [,118] [,119] [,120] [,121]
## [1,] 36 36 36 36 36 36 36 36 36 36
## [2,] 44 44 44 44 44 44 44 44 44 44
## [3,] 17 19 22 82 71 42 43 53 62 63
## [,122] [,123] [,124] [,125] [,126] [,127] [,128] [,129] [,130] [,131]
## [1,] 36 36 36 36 36 36 36 36 36 36
## [2,] 44 44 45 47 47 47 47 47 48 48
## [3,] 64 65 75 73 52 50 48 49 73 52
## [,132] [,133] [,134] [,135] [,136] [,137] [,138] [,139] [,140] [,141]
## [1,] 36 36 36 36 36 36 36 36 36 36
## [2,] 48 48 49 49 49 50 50 51 51 52
## [3,] 50 49 73 52 50 73 52 84 61 73
## [,142] [,143] [,144] [,145] [,146] [,147] [,148] [,149] [,150] [,151]
## [1,] 36 36 36 36 36 36 36 36 36 36
## [2,] 53 54 54 56 58 59 60 60 60 60
## [3,] 82 87 56 89 79 92 2 20 23 25
## [,152] [,153] [,154] [,155] [,156] [,157] [,158] [,159] [,160] [,161]
## [1,] 36 36 36 36 36 36 36 36 36 36
## [2,] 60 60 60 61 62 62 62 62 63 64
## [3,] 31 81 43 84 71 19 35 63 71 3
## [,162] [,163] [,164] [,165] [,166] [,167] [,168] [,169] [,170] [,171]
## [1,] 36 36 36 36 36 36 36 36 36 36
## [2,] 64 64 64 64 64 65 65 65 65 65
## [3,] 71 19 63 62 46 4 71 19 24 64
## [,172] [,173] [,174] [,175] [,176] [,177] [,178] [,179] [,180] [,181]
## [1,] 36 36 36 36 41 41 41 41 41 41
## [2,] 65 65 65 66 1 1 1 1 6 6
## [3,] 63 57 62 83 83 38 39 66 27 75
## [,182] [,183] [,184] [,185] [,186] [,187] [,188] [,189] [,190] [,191]
## [1,] 41 41 41 41 41 41 41 41 41 41
## [2,] 6 6 7 7 8 11 11 11 12 12
## [3,] 40 45 8 69 69 12 13 70 70 13
## [,192] [,193] [,194] [,195] [,196] [,197] [,198] [,199] [,200] [,201]
## [1,] 41 41 41 41 41 41 41 41 41 41
## [2,] 13 26 26 26 26 26 26 27 27 27
## [3,] 70 73 52 47 48 49 50 75 45 40
## [,202] [,203] [,204] [,205] [,206] [,207] [,208] [,209] [,210] [,211]
## [1,] 41 41 41 41 41 41 41 41 41 41
## [2,] 30 30 30 38 38 38 39 39 40 40
## [3,] 84 61 51 83 66 39 83 66 75 45
## [,212] [,213] [,214] [,215] [,216] [,217] [,218] [,219] [,220] [,221]
## [1,] 41 41 41 41 41 41 41 41 41 41
## [2,] 43 43 43 44 44 44 44 45 47 47
## [3,] 82 54 53 3 82 43 53 75 73 52
## [,222] [,223] [,224] [,225] [,226] [,227] [,228] [,229] [,230] [,231]
## [1,] 41 41 41 41 41 41 41 41 41 41
## [2,] 47 47 47 48 48 48 48 49 49 49
## [3,] 50 48 49 73 52 50 49 73 52 50
## [,232] [,233] [,234] [,235] [,236] [,237] [,238] [,239] [,240] [,241]
## [1,] 41 41 41 41 41 41 41 41 41 41
## [2,] 50 50 51 51 52 53 54 58 58 61
## [3,] 73 52 84 61 73 82 56 10 79 84
## [,242] [,243] [,244] [,245] [,246] [,247] [,248] [,249] [,250] [,251]
## [1,] 41 44 44 44 44 44 44 44 44 44
## [2,] 66 2 14 14 14 14 14 14 17 17
## [3,] 83 67 19 71 65 62 63 64 22 42
## [,252] [,253] [,254] [,255] [,256] [,257] [,258] [,259] [,260] [,261]
## [1,] 44 44 44 44 44 44 44 44 44 44
## [2,] 19 19 22 43 43 53 62 62 62 63
## [3,] 71 63 42 82 53 82 71 19 63 71
## [,262] [,263] [,264] [,265] [,266] [,267] [,268] [,269] [,270] [,271]
## [1,] 44 44 44 44 44 44 44 44 44 44
## [2,] 64 64 64 64 64 65 65 65 65 65
## [3,] 3 71 19 63 62 71 19 64 63 62
## [,272] [,273] [,274] [,275] [,276] [,277] [,278] [,279] [,280] [,281]
## [1,] 14 14 14 14 14 14 14 14 14 14
## [2,] 4 4 19 19 24 65 65 65 65 65
## [3,] 57 24 71 63 57 4 71 19 24 64
## [,282] [,283] [,284] [,285] [,286] [,287] [,288] [,289] [,290] [,291]
## [1,] 14 14 14 14 14 14 14 14 14 14
## [2,] 65 65 65 62 62 62 63 64 64 64
## [3,] 63 57 62 71 19 63 71 71 19 63
## [,292] [,293] [,294] [,295] [,296] [,297] [,298] [,299] [,300] [,301]
## [1,] 14 65 65 65 65 65 65 65 65 65
## [2,] 64 4 4 19 19 24 64 64 64 64
## [3,] 62 57 24 71 63 57 71 19 63 62
## [,302] [,303] [,304] [,305] [,306] [,307] [,308] [,309] [,310] [,311]
## [1,] 65 65 65 65 64 64 64 64 64 64
## [2,] 63 62 62 62 19 19 63 62 62 62
## [3,] 71 71 19 63 71 63 71 71 19 63
## [,312] [,313] [,314] [,315] [,316] [,317] [,318] [,319] [,320] [,321]
## [1,] 62 62 62 19 26 26 26 26 26 26
## [2,] 19 19 63 63 52 47 47 47 47 47
## [3,] 71 63 71 71 73 73 52 50 48 49
## [,322] [,323] [,324] [,325] [,326] [,327] [,328] [,329] [,330] [,331]
## [1,] 26 26 26 26 26 26 26 26 26 47
## [2,] 48 48 48 48 49 49 49 50 50 52
## [3,] 73 52 50 49 73 52 50 73 52 73
## [,332] [,333] [,334] [,335] [,336] [,337] [,338] [,339] [,340] [,341]
## [1,] 47 47 47 47 47 47 47 47 47 48
## [2,] 50 50 48 48 48 48 49 49 49 52
## [3,] 73 52 73 52 50 49 73 52 50 73
## [,342] [,343] [,344] [,345] [,346] [,347] [,348] [,349] [,350] [,351]
## [1,] 48 48 48 48 48 49 49 49 50 43
## [2,] 50 50 49 49 49 52 50 50 52 53
## [3,] 73 52 73 52 50 73 73 52 73 82
## [,352] [,353] [,354] [,355] [,356] [,357] [,358] [,359] [,360] [,361]
## [1,] 1 1 1 1 1 1 6 6 6 6
## [2,] 38 38 38 39 39 66 27 27 27 40
## [3,] 83 66 39 83 66 83 75 45 40 75
## [,362] [,363] [,364] [,365] [,366] [,367] [,368] [,369] [,370] [,371]
## [1,] 6 6 27 27 27 38 38 38 39 40
## [2,] 40 45 45 40 40 66 39 39 66 45
## [3,] 45 75 75 75 45 83 83 66 83 75
## [,372] [,373] [,374] [,375] [,376] [,377] [,378] [,379] [,380] [,381]
## [1,] 4 11 11 11 12 30 30 30 51 7
## [2,] 24 12 12 13 13 61 51 51 61 8
## [3,] 57 70 13 70 70 84 84 61 84 69
## [,382] [,383] [,384]
## [1,] 17 18 15
## [2,] 22 28 21
## [3,] 42 5 72
# Count the number of triangles that vertex "BUBBA" is in.
count_triangles(g, vids='BUBBA')
## [1] 37
# Calculate the global transitivity of the network.
g.tr <- transitivity(g)
g.tr
## [1] 0.1918082
# Calculate the local transitivity for vertex BUBBA.
transitivity(g, vids='BUBBA', type = "local")
## [1] 0.6727273
# One thousand random networks are stored in the list object gl
# Observed global transitivity of the Forrest Gump network (calculated above);
# hard-coded here so this comparison section can run stand-alone.
g.tr <- 0.1918082
# Calculate average transitivity of 1000 random graphs
gl.tr <- lapply(gl, FUN=transitivity)
gl.trs <- unlist(gl.tr)
# Get summary statistics of transitivity scores
summary(gl.trs)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.02488 0.05418 0.06119 0.06138 0.06835 0.09910
# Calculate the proportion of graphs with a transitivity score higher than Forrest Gump's network.
sum(gl.trs > g.tr)/1000
## [1] 0
# Identify the largest cliques in the network
largest_cliques(g)
## [[1]]
## + 9/94 vertices, named, from ed8ebe2:
## [1] FORREST STRONGARM BUBBA DALLAS LT DAN MAN SGT SIMS
## [8] SOLDIER SONG
##
## [[2]]
## + 9/94 vertices, named, from ed8ebe2:
## [1] FORREST JENNY EMCEE MAN # MAN #1 MAN #2 MAN #3 MAN #5 MEN
# Determine all maximal cliques in the network and assign to object 'clq'
clq <- max_cliques(g)
# Calculate the size of each maximal clique.
table(unlist(lapply(clq, length)))
##
## 2 3 4 5 6 7 9
## 12 24 7 2 4 2 2
# Often in network visualization you will need to subset part of a network to inspect the inter-connections of particular vertices
# Here, you will create a visualization of the largest cliques in the Forrest Gump network
# In the last exercise you determined that there were two cliques of size 9
# You will plot these side-by-side after creating two new igraph objects by subsetting out these cliques from the main network
# The function subgraph() enables you to choose which vertices to keep in a new network object
# Assign largest cliques output to object 'lc'
lc <- largest_cliques(g)
# Create two new undirected subgraphs, each containing only the vertices of each largest clique.
gs1 <- as.undirected(induced_subgraph(g, lc[[1]]))
gs2 <- as.undirected(induced_subgraph(g, lc[[2]]))
# Plot the two largest cliques side-by-side
par(mfrow=c(1, 2)) # To plot two plots side-by-side
plot(gs1,
vertex.label.color = "black",
vertex.label.cex = 0.9,
vertex.size = 0,
edge.color = 'gray28',
main = "Largest Clique 1",
layout = layout.circle(gs1)
)
plot(gs2,
vertex.label.color = "black",
vertex.label.cex = 0.9,
vertex.size = 0,
edge.color = 'gray28',
main = "Largest Clique 2",
layout = layout.circle(gs2)
)
par(mfrow=c(1, 1)) # To return to the defaults
Chapter 4 - Identifying Special Relationships
Close relationships: assortativity and reciprocity:
Community detection - building on cliques, reciprocity, assortativity:
Interactive network visualizations:
Example code includes:
# Plot the network
plot(g1)
# Convert the gender attribute into a numeric value
values <- as.numeric(factor(V(g1)$gender))
# Calculate the assortativity of the network based on gender
assortativity(g1, values)
## [1] 0.1319444
# Calculate the assortativity degree of the network
assortativity.degree(g1, directed = FALSE)
## [1] 0.4615385
# Calculate the observed assortativity
observed.assortativity <- assortativity(g1, values)
# Calculate the assortativity of the network randomizing the gender attribute 1000 times
results <- vector('list', 1000)
for(i in 1:1000){
results[[i]] <- assortativity(g1, sample(values))
}
# Plot the distribution of assortativity values and add a red vertical line at the original observed value
hist(unlist(results))
abline(v = observed.assortativity, col = "red", lty = 3, lwd=2)
fromData <- c(1, 6, 8, 9, 11, 12, 3, 5, 7, 12, 2, 4, 5, 10, 13, 14, 3, 6, 9, 10, 13, 3, 8, 15, 1, 3, 9, 8, 9, 13, 5, 12, 13, 14, 1, 2, 3, 4, 7, 11, 12, 14, 3, 6, 14, 15, 4, 6, 9, 12, 3, 7, 8, 12, 14, 3, 4, 10, 11)
toData <- c(15, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6, 6, 7, 7, 7, 15, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 12, 12, 12, 15, 13, 13, 13, 13, 13, 14, 14, 14, 14)
g <- graph_from_data_frame(data.frame(from=fromData, to=toData))
# Make a plot of the chimp grooming network
plot(g,
edge.color = "black",
edge.arrow.size = 0.3,
edge.arrow.width = 0.5)
# Calculate the reciprocity of the graph
reciprocity(g)
## [1] 0.2711864
# The first community detection method you will try is fast-greedy community detection
# You will use the Zachary Karate Club network
# This social network contains 34 club members and 78 edges
# Each edge indicates that those two club members interacted outside the karate club as well as at the club
# Perform fast-greedy community detection on network graph
fromData <- c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 5, 5, 6, 6, 6, 7, 9, 9, 9, 10, 14, 15, 15, 16, 16, 19, 19, 20, 21, 21, 23, 23, 24, 24, 24, 24, 24, 25, 25, 25, 26, 27, 27, 28, 29, 29, 30, 30, 31, 31, 32, 32, 33)
toData <- c(2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 18, 20, 22, 32, 3, 4, 8, 14, 18, 20, 22, 31, 4, 8, 28, 29, 33, 10, 9, 14, 8, 13, 14, 7, 11, 7, 11, 17, 17, 31, 33, 34, 34, 34, 33, 34, 33, 34, 33, 34, 34, 33, 34, 33, 34, 26, 28, 33, 34, 30, 26, 28, 32, 32, 30, 34, 34, 32, 34, 33, 34, 33, 34, 33, 34, 34)
g <- graph_from_data_frame(data.frame(from=fromData, to=toData), directed=FALSE)
# Run fast-greedy hierarchical community detection (works on undirected graphs)
kc <- fastgreedy.community(g)
# Determine sizes of each community
sizes(kc)
## Community sizes
## 1 2 3
## 8 17 9
# Determine which individuals belong to which community
membership(kc)
## 1 2 3 4 5 6 7 9 10 14 15 16 19 20 21 23 24 25 26 27 28 29 30 31 32
## 1 3 3 3 1 1 1 2 3 3 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2
## 33 8 11 12 13 18 22 17 34
## 2 3 1 1 3 3 3 1 2
# Plot the community structure of the network
plot(kc, g)
# Perform edge-betweenness community detection on network graph
# Run Girvan-Newman (edge-betweenness) community detection
# NOTE(review): the name `gc` shadows base::gc() for the rest of the script;
# kept because later plot calls reference it, but consider renaming.
gc <- edge.betweenness.community(g)
# Determine sizes of each community
sizes(gc)
## Community sizes
## 1 2 3 4 5
## 10 6 5 12 1
# Plot community networks determined by fast-greedy and edge-betweenness methods side-by-side
par(mfrow = c(1, 2))
plot(kc, g)
plot(gc, g)
par(mfrow = c(1, 1))
# In this course you have exclusively used igraph to make basic static network plots
# There are many packages available to make network plots
# One very useful one is threejs which allows you to make interactive network visualizations
# This package also integrates seamlessly with igraph
# In this exercise you will make a basic interactive network plot of the karate club network using the threejs package
# Once you have produced the visualization be sure to move the network around with your mouse
# You should be able to scroll in and out of the network as well as rotate the network
library(igraph)
library(threejs)
# Set a vertex attribute called 'color' to 'dodgerblue'
g <- set_vertex_attr(g, "color", value = "dodgerblue")
# Redraw the graph and make the vertex size 1 (ActiveX does not work with browser)
# graphjs(g, vertex.size = 1)
# Create numerical vector of vertex eigenvector centralities
ec <- as.numeric(eigen_centrality(g)$vector)
# Create new vector 'v' that is equal to the square-root of 'ec' multiplied by 5
v <- 5*sqrt(ec)
# Plot threejs plot of graph setting vertex size to v (ActiveX does not work with browser)
# graphjs(g, vertex.size = v)
# Create an object 'i' containing the memberships of the fast-greedy community detection
i <- membership(kc)
# Check the number of different communities
sizes(kc)
## Community sizes
## 1 2 3
## 8 17 9
# Add a color attribute to each vertex, setting the vertex color based on community membership
g <- set_vertex_attr(g, "color", value = c("yellow", "blue", "red")[i])
# Plot the graph using threejs (ActiveX does not work with browser)
# graphjs(g)
Chapter 1 - Introduction
Problems in spatial statistics:
Simulation and testing with spatstat:
Further testing:
Example code includes:
# The number of points to create
n <- 200
# Set the range
xmin <- 0
xmax <- 1
ymin <- 0
ymax <- 2
# Sample from a Uniform distribution
x <- runif(n, xmin, xmax)
y <- runif(n, ymin, ymax)
# The ratio of the Y axis scale to the X axis scale is called the aspect ratio of the plot. Spatial data should always be presented with an aspect ratio of 1:1.
# See pre-defined variables
# ls.str()
# Plot points and a rectangle
# Plot sampled points together with the rectangular sampling window.
#
# a  : aspect ratio passed to plot(); use 1 so one x unit renders the same
#      length as one y unit (recommended for spatial data).
# px, py          : point coordinates (default: globals x, y defined above).
# x0, y0, x1, y1  : window corners (default: globals xmin/ymin/xmax/ymax).
#
# Defaults preserve the original behavior of reading the script's globals,
# while allowing the function to be reused on other data. Returns NULL
# invisibly (called for its plotting side effect).
mapxy <- function(a = NA, px = x, py = y,
                  x0 = xmin, y0 = ymin, x1 = xmax, y1 = ymax){
  plot(px, py, asp = a)
  rect(x0, y0, x1, y1)
}
mapxy(1)
# How do we create a uniform density point pattern in a circle?
# We might first try selecting radius and angle uniformly. But that produces a "cluster" at small distances
# Instead we sample the radius from a non-uniform distribution that scales linearly with distance, so we have fewer points at small radii and more at large radii
# This exercise uses spatstat's disc() function, that creates a circular window.
# Load the spatstat package
library(spatstat)
## Loading required package: spatstat.data
## Loading required package: nlme
##
## Attaching package: 'nlme'
## The following object is masked from 'package:forecast':
##
## getResponse
## The following object is masked from 'package:directlabels':
##
## gapply
## The following object is masked from 'package:dplyr':
##
## collapse
## Loading required package: rpart
##
## spatstat 1.55-0 (nickname: 'Stunned Mullet')
## For an introduction to spatstat, type 'beginner'
##
## Note: R version 3.3.3 (2017-03-06) is more than 9 months old; we strongly recommend upgrading to the latest version
##
## Attaching package: 'spatstat'
## The following object is masked from 'package:threejs':
##
## vertices
## The following objects are masked from 'package:igraph':
##
## diameter, edges, is.connected, vertices
## The following object is masked from 'package:lattice':
##
## panel.histogram
# Sample n_points uniformly over a disc of the given radius.
# Sampling r uniformly would cluster points near the center; sampling r^2
# uniformly and taking the square root gives a spatially uniform density.
# (Uses the documented `^` operator instead of the undocumented `**` alias.)
n_points <- 300
radius <- 10
# Generate uniform random numbers up to radius-squared
r_squared <- runif(n_points, 0, radius^2)
angle <- runif(n_points, 0, 2 * pi)
# Take the square root of the values to get a uniform spatial distribution
x <- sqrt(r_squared) * cos(angle)
y <- sqrt(r_squared) * sin(angle)
plot(spatstat::disc(radius))
points(x, y)
# Some variables have been pre-defined
# ls.str()
# Set coordinates and window
ppxy <- ppp(x = x, y = y, window = disc(radius))
# Test the point pattern
qt <- quadrat.test(ppxy)
## Warning: Some expected counts are small; chi^2 approximation may be
## inaccurate
# Inspect the results
plot(qt)
print(qt)
##
## Chi-squared test of CSR using quadrat counts
## Pearson X2 statistic
##
## data: ppxy
## X2 = 26.268, df = 24, p-value = 0.6794
## alternative hypothesis: two.sided
##
## Quadrats: 25 tiles (irregular windows)
# In the previous exercise you used a set of 300 events scattered uniformly within a circle
# If you repeated the generation of the events again you will still have 300 of them, but in different locations
# The dataset of exactly 300 points is from a Poisson point process conditioned on the total being 300
# The spatstat package can generate Poisson spatial processes with the rpoispp() function given an intensity and a window, that are not conditioned on the total
# Just as the random number generator functions in R start with an "r", most of the random point-pattern functions in spatstat start with an "r".
# The area() function of spatstat will compute the area of a window such as a disc
# Create a disc of radius 10
disc10 <- disc(10)
# Compute the rate as count divided by area
lambda <- 500 / area(disc10)
# Create a point pattern object
ppois <- rpoispp(lambda = lambda, win = disc10)
# Plot the Poisson point pattern
plot(ppois)
# The spatstat package also has functions for generating point patterns from other process models.
# These generally fall into one of two classes: clustered processes, where points occur together more than under a uniform Poisson process,
# and regular (aka inhibitory) processes where points are more spaced apart than under a uniform intensity Poisson process
# Some process models can generate patterns on a continuum from clustered through uniform to regular depending on their parameters
# The quadrat.test() function can test against clustered or regular alternative hypotheses
# By default it tests against either of those, but this can be changed with the alternative parameter to create a one-sided test.
# A Thomas process is a clustered pattern where a number of "parent" points, uniformly distributed, create a number of "child" points in their neighborhood
# The child points themselves form the pattern. This is an attractive point pattern, and makes sense for modelling things like trees, since new trees will grow near the original tree
# Random Thomas point patterns can be generated using rThomas()
# This takes three numbers that determine the intensity and clustering of the points, and a window object.
# Conversely the points of a Strauss process cause a lowering in the probability of finding another point nearby
# The parameters of a Strauss process can be such that it is a "hard-core" process, where no two points can be closer than a set threshold
# Creating points from this process involves some clever simulation algorithms
# This is a repulsive point pattern, and makes sense for modelling things like territorial animals, since the other animals of that species will avoid the territory of a given animal
# Random Strauss point patterns can be generated using rStrauss()
# This takes three numbers that determine the intensity and "territory" of the points, and a window object
# Points generated by a Strauss process are sometimes called regularly spaced.
# Create a disc of radius 10
disc10 <- disc(10)
# Generate clustered points from a Thomas process
set.seed(123)
p_cluster <- rThomas(kappa = 0.35, scale = 1, mu = 3, win = disc10)
plot(p_cluster)
# Run a quadrat test
quadrat.test(p_cluster, alternative = "clustered")
## Warning: Some expected counts are small; chi^2 approximation may be
## inaccurate
##
## Chi-squared test of CSR using quadrat counts
## Pearson X2 statistic
##
## data: p_cluster
## X2 = 53.387, df = 24, p-value = 0.0005142
## alternative hypothesis: clustered
##
## Quadrats: 25 tiles (irregular windows)
# Regular points from a Strauss process
set.seed(123)
p_regular <- rStrauss(beta = 2.9, gamma = 0.025, R = .5, W = disc10)
## Warning: Simulation will be performed in the containing rectangle and
## clipped to the original window.
plot(p_regular)
# Run a quadrat test
quadrat.test(p_regular, alternative = "regular")
## Warning: Some expected counts are small; chi^2 approximation may be
## inaccurate
##
## Chi-squared test of CSR using quadrat counts
## Pearson X2 statistic
##
## data: p_regular
## X2 = 8.57, df = 24, p-value = 0.001619
## alternative hypothesis: regular
##
## Quadrats: 25 tiles (irregular windows)
# Another way of assessing clustering and regularity is to consider each point, and how it relates to the other points
# One simple measure is the distribution of the distances from each point to its nearest neighbor
# The nndist() function in spatstat takes a point pattern and for each point returns the distance to its nearest neighbor
# Instead of working with the nearest-neighbor density, as seen in the histogram, it can be easier to work with the cumulative distribution function, G(r)
# This is the probability of a point having a nearest neighbour within a distance r
# For a uniform Poisson process, G can be computed theoretically, and is G(r) = 1 - exp( - lambda * pi * r ^ 2)
# You can compute G empirically from your data using Gest() and so compare with the theoretical value.
# Events near the edge of the window might have had a nearest neighbor outside the window, and so unobserved
# This will make the distance to its observed nearest neighbor larger than expected, biasing the estimate of G
# There are several methods for correcting this bias
# Plotting the output from Gest shows the theoretical cumulative distribution and several estimates of the cumulative distribution using different edge corrections
# Often these edge corrections are almost indistinguishable, and the lines overlap
# The plot can be used as a quick exploratory test of complete spatial randomness
# Two ppp objects, p_poisson, and p_regular are defined for you
# Point patterns are pre-defined
p_poisson <- ppois
p_poisson
## Planar point pattern: 501 points
## window: polygonal boundary
## enclosing rectangle: [-10, 10] x [-10, 10] units
p_regular
## Planar point pattern: 325 points
## window: polygonal boundary
## enclosing rectangle: [-10, 10] x [-10, 10] units
# Calc nearest-neighbor distances for Poisson point data
nnd_poisson <- nndist(p_poisson)
# Draw a histogram of nearest-neighbor distances
hist(nnd_poisson)
# Estimate G(r)
G_poisson <- Gest(p_poisson)
# Plot G(r) vs. r
plot(G_poisson)
# Repeat for regular point data
nnd_regular <- nndist(p_regular)
hist(nnd_regular)
G_regular <- Gest(p_regular)
plot(G_regular)
# A number of other functions of point patterns have been developed
# They are conventionally denoted by various capital letters, including F, H, J, K and L
# The K-function is defined as the expected number of points within a distance of a point of the process, scaled by the intensity
# Like G, this can be computed theoretically for a uniform Poisson process and is K(r) = pi * r ^ 2 - the area of a circle of that radius
# Deviation from pi * r ^ 2 can indicate clustering or point inhibition
# Computational estimates of K(r) are done using the Kest() function.
# As with G calculations, K-function calculations also need edge corrections
# The default edge correction in spatstat is generally the best, but can be slow, so we'll use the "border" correction for speed here
# Uncertainties on K-function estimates can be assessed by randomly sampling points from a uniform Poisson process in the area and computing the K-function of the simulated data
# Repeat this process 99 times, and take the minimum and maximum value of K over each of the distance values
# This gives an envelope - if the K-function from the data goes above the top of the envelope then we have evidence for clustering
# If the K-function goes below the envelope then there is evidence for an inhibitory process causing points to be spaced out
# Envelopes can be computed using the envelope() function
# The plot method for estimates of K uses a formula system where a dot on the left of a formula refers to K(r)
# So the default plot uses . ~ r
# You can compare the estimate of K to a Poisson process by plotting . - pi * r ^ 2 ~ r
# If the data was generated by a Poisson process, then the line should be close to zero for all values of r
# Point patterns are pre-defined
p_poisson
## Planar point pattern: 501 points
## window: polygonal boundary
## enclosing rectangle: [-10, 10] x [-10, 10] units
p_cluster
## Planar point pattern: 332 points
## window: polygonal boundary
## enclosing rectangle: [-10, 10] x [-10, 10] units
p_regular
## Planar point pattern: 325 points
## window: polygonal boundary
## enclosing rectangle: [-10, 10] x [-10, 10] units
# Estimate the K-function for the Poisson points
# ("border" edge correction is faster than spatstat's default)
K_poisson <- Kest(p_poisson, correction = "border")
# The default plot shows quadratic growth
plot(K_poisson, . ~ r)
# Subtract pi * r ^ 2 from the Y-axis and plot
# (fixed: use the documented `^` operator instead of the legacy `**` spelling,
# for consistency with the other K-function plots below)
plot(K_poisson, . - pi * r^2 ~ r)
# Compute envelopes of K under random locations
K_cluster_env <- envelope(p_cluster, Kest, correction = "border")
## Generating 99 simulations of CSR ...
## 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38,
## 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76,
## 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99.
##
## Done.
# Insert the full formula to plot K minus pi * r^2
plot(K_cluster_env, . - pi * r^2 ~ r)
# Repeat for regular data
K_regular_env <- envelope(p_regular, Kest, correction = "border")
## Generating 99 simulations of CSR ...
## 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38,
## 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76,
## 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99.
##
## Done.
plot(K_regular_env, . - pi * r^2 ~ r)
Chapter 2 - Point Pattern Analysis
Bivariate point problems:
Spatial segregation:
Space-time data:
Space-time clustering:
Example code includes:
# The dataset we shall use for this example consists of crimes in a 4km radius of the center of Preston, a town in north-west England
# We want to look for hotspots of violent crime in the area
# A ppp object called preston_crime has been constructed
# This is a marked point process, where each point is marked as either a "Violent Crime" or a "Non-violent crime"
# The marks for each point can be retrieved using the marks() function
# The window is a 4km circle centered on the town center
# A map image of the town from OpenStreetMap has also been loaded, called preston_osm
# Load the crime point pattern and the OSM background image from local files
preston_crime <- readRDS("./RInputFiles/pcrime-spatstat.RDS")
preston_osm <- readRDS("./RInputFiles/osm_preston_gray.RDS")
# Get some summary information on the dataset
summary(preston_crime)
## Marked planar point pattern: 2036 points
## Average intensity 4.053214e-05 points per square unit
##
## Coordinates are given to 2 decimal places
## i.e. rounded to the nearest multiple of 0.01 units
##
## Multitype:
## frequency proportion intensity
## Non-violent crime 1812 0.8899804 3.607281e-05
## Violent crime 224 0.1100196 4.459332e-06
##
## Window: polygonal boundary
## single connected closed polygon with 99 vertices
## enclosing rectangle: [349773, 357771] x [425706.5, 433705.5] units
## Window area = 50231700 square units
## Fraction of frame area: 0.785
# Get a table of marks (the crime type attached to each point)
table(marks(preston_crime))
##
## Non-violent crime Violent crime
## 1812 224
# Define a function to create a map
# Overlays the Preston crime points on the OpenStreetMap background image.
# cols, cex and pch are recycled across the two mark levels
# (non-violent, violent). Relies on the global objects preston_osm and
# preston_crime defined above.
preston_map <- function(cols = c("green","red"), cex = c(1, 1), pch = c(1, 1)) {
  # Background image, drawn with the raster package
  raster::plotRGB(preston_osm)
  # Overlay the marked point pattern, keeping the window outline visible
  plot(preston_crime,
       add = TRUE,
       show.window = TRUE,
       cols = cols,
       pch = pch,
       cex = cex)
}
# Draw the map with colors, sizes and plot character
# (a scalar pch = 19 is recycled, so both crime types use filled circles)
preston_map(
cols = c("black", "red"),
cex = c(0.5, 1),
pch = 19
)
# One method of computing a smooth intensity surface from a set of points is to use kernel smoothing
# Imagine replacing each point with a dot of ink on absorbent paper
# Each individual ink drop spreads out into a patch with a dark center, and multiple drops add together and make the paper even darker
# With the right amount of ink in each drop, and with paper of the right absorbency, you can create a fair impression of the density of the original points
# In kernel smoothing jargon, this means computing a bandwidth and using a particular kernel function
# To get a smooth map of violent crimes proportion, we can estimate the intensity surface for violent and non-violent crimes, and take the ratio
# To do this with the density() function in spatstat, we have to split the points according to the two values of the marks and then compute the ratio of the violent crime surface to the total
# The function has sensible defaults for the kernel function and bandwidth to guarantee something that looks at least plausible
# preston_crime has been pre-defined
preston_crime
## Marked planar point pattern: 2036 points
## Multitype, with levels = Non-violent crime, Violent crime
## window: polygonal boundary
## enclosing rectangle: [349773, 357771] x [425706.5, 433705.5] units
# Use the split function to show the two point patterns
# (split on a marked ppp divides the pattern by its mark levels)
crime_splits <- split(preston_crime)
# Plot the split crime
plot(crime_splits)
# Compute the densities of both sets of points
# (kernel-smoothed intensity surfaces, one per mark level)
crime_densities <- density(crime_splits)
# Calc the violent density divided by the sum of both
# [[2]] is "Violent crime", [[1]] "Non-violent crime" (level order shown above)
frac_violent_crime_density <- crime_densities[[2]] /
(crime_densities[[1]] + crime_densities[[2]])
# Plot the density of the fraction of violent crime
plot(frac_violent_crime_density)
# We can get a more principled measure of the violent crime ratio using a spatial segregation model
# The spatialkernel package implements the theory of spatial segregation
# The first step is to compute the optimal bandwidth for kernel smoothing under the segregation model
# A small bandwidth would result in a density that is mostly zero, with spikes at the event locations
# A large bandwidth would flatten out any structure in the events, resulting in a large "blob" across the whole window
# Somewhere between these extremes is a bandwidth that best represents an underlying density for the process
# spseg() will scan over a range of bandwidths and compute a test statistic using a cross-validation method
# The bandwidth that maximizes this test statistic is the one to use
# The returned value from spseg() in this case is a list, with h and cv elements giving the values of the statistic over the input h values
# The spatialkernel package supplies a plotcv function to show how the test value varies
# The hcv element has the value of the best bandwidth
# spatstat is loaded and the preston_crime object is read in
# Scan from 500m to 1000m in steps of 50m
# (opt = 1 requests the bandwidth-selection / cross-validation run)
bw_choice <- spatialkernel::spseg(
preston_crime,
h = seq(500, 1000, by = 50),
opt = 1)
##
## Calculating cross-validated likelihood function
# Plot the results and highlight the best bandwidth
spatialkernel::plotcv(bw_choice)
# Dashed red vertical line at the cross-validation-optimal bandwidth
abline(v = bw_choice$hcv, lty = 2, col = "red")
# Print the best bandwidth
print(bw_choice$hcv)
## [1] 800
# The second step is to compute the probabilities for violent and non-violent crimes as a smooth surface, as well as the p-values for a point-wise test of segregation
# This is done by calling spseg() with opt = 3 and a fixed bandwidth parameter h
# Normally you would run this process for at least 100 simulations, but that will take too long to run here
# Instead, run for only 10 simulations
# Then you can use a pre-loaded object seg which is the output from a 1000 simulation run that took about 20 minutes to complete
# Set the correct bandwidth and run for 10 simulations only
# opt = 3 computes the type-probability surfaces plus pointwise Monte-Carlo
# p-values; ntest is the number of simulations.
# proc = FALSE presumably suppresses progress output -- confirm in the
# spatialkernel::spseg documentation.
seg10 <- spatialkernel::spseg(
pts = preston_crime,
h = bw_choice$hcv,
opt = 3,
ntest = 10,
proc = FALSE)
# Plot the segregation map for violent crime
spatialkernel::plotmc(seg10, "Violent crime")
# Plot seg, the result of running 1000 simulations (not included here)
# spatialkernel::plotmc(seg, "Violent crime")
# With a base map and some image and contour functions we can display both the probabilities and the significance tests over the area with more control than the plotmc() function.
# The seg object is a list with several components
# The X and Y coordinates of the grid are stored in the $gridx and $gridy elements
# The probabilities of each class of data (violent or non-violent crime) are in a matrix element $p with a column for each class
# The p-value of the significance test is in a similar matrix element called $stpvalue
# Rearranging columns of these matrices into a grid of values can be done with R's matrix() function
# From there you can construct list objects with a vector $x of X-coordinates, $y of Y-coordinates, and $z as the matrix
# You can then feed this to image() or contour() for visualization
# This process may seem complex, but remember that with R you can always write functions to perform complex tasks and those you may repeat often
# For example, to help with the mapping in this exercise you will create a function that builds a map from four different items
# The seg object from 1000 simulations is loaded, as well as the preston_crime points and the preston_osm map image
# Inspect the structure of the spatial segregation object
# str(seg)
# Get the number of columns in the data so we can rearrange to a grid
# ncol <- length(seg$gridx)
# Rearrange the probability column into a grid
# prob_violent <- list(x = seg$gridx,
# y = seg$gridy,
# z = matrix(seg$p[, "Violent crime"],
# ncol = ncol))
# image(prob_violent)
# Rearrange the p-values, but choose a p-value threshold
# p_value <- list(x = seg$gridx,
# y = seg$gridy,
# z = matrix(seg$stpvalue[, "Violent crime"] < 0.05,
# ncol = ncol))
# image(p_value)
# Create a mapping function
# segmap <- function(prob_list, pv_list, low, high){
#
# # background map
# plotRGB(preston_osm)
#
# # p-value areas
# image(pv_list,
# col = c("#00000000", "#FF808080"), add = TRUE)
#
# # probability contours
# contour(prob_list,
# levels = c(low, high),
# col = c("#206020", "red"),
# labels = c("Low", "High"),
# add = TRUE)
#
# # boundary window
# plot(Window(preston_crime), add = TRUE)
# }
#
# # Map the probability and p-value
# segmap(prob_violent, p_value, 0.05, 0.15)
# The sasquatch, or "bigfoot", is a large ape-like creature reported to live in North American forests
# The Bigfoot Field Researchers Organization maintains a database of sightings and allows its use for teaching and research
# A cleaned subset of data in north-west USA has been created as the ppp object sasq and is loaded for you to explore the space-time pattern of sightings in the area
# Get a quick summary of the dataset
# (sasq is a marked ppp of sighting locations with date/year/month marks,
# as the summary output below shows)
sasq <- readRDS("./RInputFiles/sasquatch.RDS")
summary(sasq)
## Marked planar point pattern: 423 points
## Average intensity 2.097156e-09 points per square unit
##
## *Pattern contains duplicated points*
##
## Coordinates are given to 1 decimal place
## i.e. rounded to the nearest multiple of 0.1 units
##
## Mark variables: date, year, month
## Summary:
## date year month
## Min. :1990-05-03 Y2004 : 41 Sep : 59
## 1st Qu.:2000-04-30 Y2000 : 36 Oct : 56
## Median :2003-11-05 Y2002 : 30 Aug : 54
## Mean :2003-08-11 Y2005 : 30 Jul : 50
## 3rd Qu.:2007-11-02 Y2001 : 26 Nov : 43
## Max. :2016-04-05 Y2008 : 26 Jun : 41
## (Other):234 (Other):120
##
## Window: polygonal boundary
## single connected closed polygon with 64 vertices
## enclosing rectangle: [368187.8, 764535.6] x [4644873, 5434933] units
## Window area = 2.01702e+11 square units
## Fraction of frame area: 0.644
# Plot unmarked points
plot(unmark(sasq))
# Plot the points using a circle sized by date
plot(sasq, which.marks = "date")
# Show the available marks
names(marks(sasq))
## [1] "date" "year" "month"
# Histogram the dates of the sightings, grouped by year
hist(marks(sasq)$date, "years", freq = TRUE)
# Plot and tabulate the calendar month of all the sightings
plot(table(marks(sasq)$month))
# Split on the month mark
# (un = TRUE: points are unmarked after splitting -- see ?split.ppp)
sasq_by_month <- split(sasq, "month", un = TRUE)
# Plot monthly maps
plot(sasq_by_month)
# Plot smoothed versions of the above split maps
plot(density(sasq_by_month))
# To do a space-time clustering test with stmctest() from the splancs package, you first need to convert parts of your ppp object
# Functions in splancs tend to use matrix data instead of data frames.
# To run stmctest() you need to set up
# event locations
# event times
# region polygon
# time limits
# the time and space ranges for analysis
# The sasq object is loaded and the spatstat and splancs packages are ready for use
# Get a matrix of event coordinates
sasq_xy <- as.matrix(coords(sasq))
# Check the matrix has two columns
dim(sasq_xy)
## [1] 423 2
# Get a vector of event times
sasq_t <- marks(sasq)$date
# Extract a two-column matrix from the ppp object
# NOTE(review): assumes Window(sasq) is a single polygon whose data.frame
# conversion yields x/y vertex columns -- confirm for multi-polygon windows
sasq_poly <- as.matrix(as.data.frame(Window(sasq)))
dim(sasq_poly)
## [1] 64 2
# Set the time limit to 1 day before and 1 day after the range of times
# (Date arithmetic: adding +/- 1 shifts by one day)
tlimits <- range(sasq_t) + c(-1, 1)
# Scan over 400m intervals from 100m to 20km
s <- seq(100, 20000, by = 400)
# Scan over 14 day intervals from one week to 31 weeks
tm <- seq(7, 217, by = 14)
# Everything is now ready for you to run the space-time clustering test function
# You can then plot the results and compute a p-value for rejecting the null hypothesis of no space-time clustering
# Any space-time clustering in a data set will be removed if you randomly rearrange the dates of the data points
# The stmctest() function computes a clustering test statistic for your data based on the space-time K-function - how many points are within a spatial and temporal window of a point of the data
# It then does a number of random rearrangements of the dates among the points and computes the clustering statistic
# After doing this a large number of times, you can compare the test statistic for your data with the values from the random data
# If the test statistic for your data is sufficiently large or small, you can reject the null hypothesis of no space-time clustering
# The output from stmctest() is a list with a single t0 which is the test statistic for your data, and a vector of t from the simulations
# By converting to data frame you can feed this to ggplot functions
# Because the window area is a large number of square meters, and we have about 400 events, the numerical value of the intensity is a very small number
# This makes values of the various K-functions very large numbers, since they are proportional to the inverse of the intensity
# Don't worry if you see 10^10 or higher
# The p-value of a Monte-Carlo test like this is just the proportion of test statistics that are larger than the value from the data
# You can compute this from the t and t0 elements of the output
# All the objects from the previous exercise are loaded.
# Run 999 simulations
sasq_mc <- splancs::stmctest(sasq_xy, sasq_t, sasq_poly, tlimits, s, tm, nsim = 999, quiet = TRUE)
names(sasq_mc)
## [1] "t0" "t"
# Histogram the simulated statistics and add a line at the data value
ggplot(data.frame(sasq_mc), aes(x = t)) +
geom_histogram(binwidth = 1e13) +
geom_vline(aes(xintercept = t0))
# Compute the p-value as the proportion of tests greater than the data
# NOTE(review): the conventional Monte-Carlo p-value is
# (1 + sum(t >= t0)) / (nsim + 1); dividing sum(t > t0) by 1000 omits the
# observed statistic from the numerator -- confirm whether that was intended
sum(sasq_mc$t > sasq_mc$t0) / 1000
## [1] 0.04
Chapter 3 - Areal Statistics
Areal statistics:
Spatial health data:
Generalized linear models in space:
Correlation in spatial GLM:
Example code includes:
library(cartogram)
library(rgeos)
## rgeos version: 0.3-26, (SVN revision 560)
## GEOS runtime version: 3.6.1-CAPI-1.10.1 r0
## Linking to sp version: 1.2-7
## Polygon checking: TRUE
library(spdep)
## Loading required package: sp
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following object is masked from 'package:tidyr':
##
## expand
## Loading required package: spData
## To access larger datasets in this package, install the spDataLarge
## package with: `install.packages('spDataLarge')`
##
## Attaching package: 'spData'
## The following objects are masked _by_ '.GlobalEnv':
##
## x, y
library(epitools)
library(R2BayesX)
## Loading required package: BayesXsrc
## Loading required package: colorspace
##
## Attaching package: 'colorspace'
## The following object is masked from 'package:spatstat':
##
## coords
## Loading required package: mgcv
## This is mgcv 1.8-17. For overview type 'help("mgcv-package")'.
# In 2016 the UK held a public vote on whether to remain in the European Union
# The results of the referendum, where people voted either "Remain" or "Leave", are available online
# The data set london_ref contains the results for the 32 boroughs of London, and includes the number and percentage of votes in each category as well as the count of spoilt votes, the population size and the electorate size
# The london_ref object is a SpatialPolygonsDataFrame, a special kind of data frame where each row also has the shape of the borough
# It behaves like a data frame in many respects, but can also be used to plot a choropleth, or shaded polygon, map
# You should start with some simple data exploration and mapping. The following variables will be useful:
# NAME : the name of the borough.
# Electorate : the total number of people who can vote.
# Remain, Leave : the number of votes for "Remain" or "Leave".
# Pct_Remain, Pct_Leave : the percentage of votes for each side
# spplot() from the raster package provides a convenient way to draw a shaded map of regions
# See what information we have for each borough
# (london_ref is a SpatialPolygonsDataFrame of the 32 London boroughs)
london_ref <- readRDS("./RInputFiles/london_eu.RDS")
summary(london_ref)
## Object of class SpatialPolygonsDataFrame
## Coordinates:
## min max
## x 503574.2 561956.7
## y 155850.8 200933.6
## Is projected: TRUE
## proj4string :
## [+proj=tmerc +lat_0=49 +lon_0=-2 +k=0.9996012717 +x_0=400000
## +y_0=-100000 +datum=OSGB36 +units=m +no_defs +ellps=airy
## +towgs84=446.448,-125.157,542.060,0.1502,0.2470,0.8421,-20.4894]
## Data attributes:
## NAME TOTAL_POP Electorate Votes_Cast
## Length:32 Min. :157711 Min. : 83042 Min. : 54801
## Class :character 1st Qu.:237717 1st Qu.:143458 1st Qu.:104079
## Mode :character Median :272017 Median :168394 Median :116280
## Mean :270780 Mean :169337 Mean :118025
## 3rd Qu.:316911 3rd Qu.:196285 3rd Qu.:134142
## Max. :379691 Max. :245349 Max. :182570
## Remain Leave Rejected_Ballots Pct_Remain
## Min. : 27750 Min. :17138 Min. : 60.0 Min. :30.34
## 1st Qu.: 55973 1st Qu.:32138 1st Qu.:105.0 1st Qu.:53.69
## Median : 70254 Median :45263 Median :138.0 Median :61.01
## Mean : 70631 Mean :47255 Mean :139.0 Mean :60.46
## 3rd Qu.: 84287 3rd Qu.:59018 3rd Qu.:164.2 3rd Qu.:69.90
## Max. :118463 Max. :96885 Max. :267.0 Max. :78.62
## Pct_Leave Pct_Rejected Assembly
## Min. :21.38 Min. :0.0600 Length:32
## 1st Qu.:30.10 1st Qu.:0.0875 Class :character
## Median :38.99 Median :0.1100 Mode :character
## Mean :39.54 Mean :0.1187
## 3rd Qu.:46.31 3rd Qu.:0.1500
## Max. :69.66 Max. :0.2200
# Which boroughs voted to "Leave"?
# (logical subsetting: boroughs with more Leave votes than Remain votes)
london_ref$NAME[london_ref$Leave > london_ref$Remain]
## [1] "Sutton" "Barking and Dagenham" "Bexley"
## [4] "Havering" "Hillingdon"
# Plot a map of the percentage that voted "Remain"
sp::spplot(london_ref, zcol = "Pct_Remain")
# Large areas, such as cities or countries, are often divided into smaller administrative units, often into zones of approximately equal population
# But the area of those units may vary considerably
# When mapping them, the large areas carry more visual "weight" than small areas, although just as many people live in the small areas.
# One technique for correcting for this is the cartogram
# This is a controlled distortion of the regions, expanding some and contracting others, so that the area of each region is proportional to a desired quantity, such as the population
# The cartogram also tries to maintain the correct geography as much as possible, by keeping regions in roughly the same place relative to each other
# The cartogram package contains functions for creating cartograms
# You give it a spatial data frame and the name of a column, and you get back a similar data frame but with regions distorted so that the region area is proportional to the column value of the regions
# You'll also use the rgeos package for computing the areas of individual regions with the gArea() function
# Use the cartogram and rgeos packages (called at top of routine)
# library(cartogram)
# library(rgeos)
# Make a scatterplot of electorate vs borough area
names(london_ref)
## [1] "NAME" "TOTAL_POP" "Electorate"
## [4] "Votes_Cast" "Remain" "Leave"
## [7] "Rejected_Ballots" "Pct_Remain" "Pct_Leave"
## [10] "Pct_Rejected" "Assembly"
# gArea(byid = TRUE) returns one area per polygon (borough)
plot(london_ref$Electorate, gArea(london_ref, byid = TRUE))
# Make a cartogram, scaling the area to the electorate
carto_ref <- cartogram(london_ref, "Electorate")
## Mean size error for iteration 1: 1.5881743190908
## Mean size error for iteration 2: 1.32100446455657
## Mean size error for iteration 3: 1.18227723476121
## Mean size error for iteration 4: 1.10676057030171
## Mean size error for iteration 5: 1.0657703107641
## Mean size error for iteration 6: 1.04259259672006
## Mean size error for iteration 7: 1.02832326230708
## Mean size error for iteration 8: 1.01931941526112
## Mean size error for iteration 9: 1.01341424685212
## Mean size error for iteration 10: 1.00941370606418
## Mean size error for iteration 11: 1.00663364742297
## Mean size error for iteration 12: 1.00470553629914
## Mean size error for iteration 13: 1.00336434720465
## Mean size error for iteration 14: 1.00241457265516
## Mean size error for iteration 15: 1.00174179254187
plot(carto_ref)
# Check the linearity of the electorate-area plot
# (after distortion, borough area should be roughly proportional to electorate)
plot(carto_ref$Electorate, gArea(carto_ref, byid = TRUE))
# Make a fairer map of the Remain percentage
sp::spplot(carto_ref, "Pct_Remain")
# The map of "Remain" votes seems to have spatial correlation
# Pick any two boroughs that are neighbors - with a shared border - and the chances are they'll be more similar than any two random boroughs
# This can be a problem when using statistical models that assume, conditional on the model, that the data points are independent
# The spdep package has functions for measures of spatial correlation, also known as spatial dependency
# Computing these measures first requires you to work out which regions are neighbors via the poly2nb() function, short for "polygons to neighbors"
# The result is an object of class nb
# Then you can compute the test statistic and run a significance test on the null hypothesis of no spatial correlation
# The significance test can either be done by Monte-Carlo or theoretical models
# In this example you'll use the Moran "I" statistic to test the spatial correlation of the population and the percentage "Remain" vote.
# The london_ref spatial data object is loaded for you
# Use the spdep package (called at top of routine)
# library(spdep)
# Make neighbor list
borough_nb <- poly2nb(london_ref)
# Get center points of each borough
borough_centers <- coordinates(london_ref)
# Show the connections
plot(london_ref)
plot(borough_nb, borough_centers, add = TRUE)
# Map the total pop'n
sp::spplot(london_ref, zcol = "TOTAL_POP")
# Run a Moran I test on total pop'n
# (nb2listw() converts the neighbour list into spatial weights)
moran.test(
london_ref$TOTAL_POP,
nb2listw(borough_nb)
)
##
## Moran I test under randomisation
##
## data: london_ref$TOTAL_POP
## weights: nb2listw(borough_nb)
##
## Moran I statistic standard deviate = 1.2124, p-value = 0.1127
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic Expectation Variance
## 0.11549264 -0.03225806 0.01485190
# Map % Remain
sp::spplot(london_ref, zcol = "Pct_Remain")
# Run a Moran I MC test on % Remain
# (Monte-Carlo version: permutes the values across boroughs nsim times)
moran.mc(
london_ref$Pct_Remain,
nb2listw(borough_nb),
nsim = 999
)
##
## Monte-Carlo simulation of Moran I
##
## data: london_ref$Pct_Remain
## weights: nb2listw(borough_nb)
## number of simulations + 1: 1000
##
## statistic = 0.42841, observed rank = 1000, p-value = 0.001
## alternative hypothesis: greater
# The UK's National Health Service publishes weekly data for consultations at a number of "sentinel" clinics and makes this data available
# A dataset for one week in February 2017 has been loaded for you to analyze
# It is called london, and contains data for the 32 boroughs.
# You will focus on reports of "Influenza-like illness", or more simply "Flu"
# Your first task is to map the "Standardized Morbidity Ratio", or SMR
# This is the number of cases per person, but scaled by the overall incidence so that the expected number is 1
# The london object, a spatial data frame, and the sp package are ready for you
# Get a summary of the data set
# (weekly sentinel-clinic consultation counts for the 32 London boroughs)
london <- readRDS("./RInputFiles/london_2017_2.RDS")
summary(london)
## Object of class SpatialPolygonsDataFrame
## Coordinates:
## min max
## x 503574.2 561956.7
## y 155850.8 200933.6
## Is projected: TRUE
## proj4string :
## [+proj=tmerc +lat_0=49 +lon_0=-2 +k=0.9996012717 +x_0=400000
## +y_0=-100000 +datum=OSGB36 +units=m +no_defs +ellps=airy
## +towgs84=446.448,-125.157,542.060,0.1502,0.2470,0.8421,-20.4894]
## Data attributes:
## CODE NAME Flu_OBS Vom_OBS
## Length:32 Length:32 Min. : 0.00 Min. : 0.00
## Class :character Class :character 1st Qu.: 11.00 1st Qu.:14.00
## Mode :character Mode :character Median : 33.00 Median :40.00
## Mean : 38.56 Mean :37.28
## 3rd Qu.: 61.00 3rd Qu.:57.50
## Max. :112.00 Max. :96.00
## Diarr_OBS Gastro_OBS TOTAL_POP CCGcode
## Min. : 0.00 Min. : 0.0 Min. :157711 Length:32
## 1st Qu.: 22.50 1st Qu.: 48.0 1st Qu.:237717 Class :character
## Median : 62.00 Median :120.5 Median :272017 Mode :character
## Mean : 57.03 Mean :113.7 Mean :270780
## 3rd Qu.: 90.75 3rd Qu.:176.8 3rd Qu.:316911
## Max. :122.00 Max. :251.0 Max. :379691
## CCG.geography.code CCG.name Asthma_Prevalence
## Length:32 Length:32 Min. :3.550
## Class :character Class :character 1st Qu.:4.412
## Mode :character Mode :character Median :4.660
## Mean :4.624
## 3rd Qu.:4.925
## Max. :5.470
## Obesity_Prevalence Cancer_Prevalence Diabetes_Prevalence Income
## Min. : 3.930 Min. :0.870 Min. :3.620 Min. :0.0730
## 1st Qu.: 5.855 1st Qu.:1.438 1st Qu.:5.265 1st Qu.:0.1308
## Median : 7.565 Median :1.605 Median :6.305 Median :0.1665
## Mean : 7.585 Mean :1.684 Mean :6.245 Mean :0.1655
## 3rd Qu.: 8.810 3rd Qu.:1.903 3rd Qu.:7.067 3rd Qu.:0.1985
## Max. :12.130 Max. :2.540 Max. :9.060 Max. :0.2530
## Employment Education HealthDeprivation Crime
## Min. :0.0570 Min. : 3.958 Min. :-1.4100 Min. :-0.1550
## 1st Qu.:0.0905 1st Qu.:10.047 1st Qu.:-0.5055 1st Qu.: 0.3745
## Median :0.1095 Median :13.925 Median :-0.2050 Median : 0.5515
## Mean :0.1092 Mean :14.024 Mean :-0.2044 Mean : 0.5379
## 3rd Qu.:0.1283 3rd Qu.:17.480 3rd Qu.: 0.2010 3rd Qu.: 0.7823
## Max. :0.1560 Max. :27.182 Max. : 0.5430 Max. : 1.0190
## Services Environment i
## Min. :19.63 Min. :13.37 Min. : 0.00
## 1st Qu.:24.43 1st Qu.:24.03 1st Qu.: 7.75
## Median :30.41 Median :28.20 Median :15.50
## Mean :29.55 Mean :31.38 Mean :15.50
## 3rd Qu.:34.74 3rd Qu.:40.15 3rd Qu.:23.25
## Max. :41.89 Max. :55.00 Max. :31.00
# Map the OBServed number of flu reports
sp::spplot(london, "Flu_OBS")
# Compute and print the overall incidence of flu
# (cases per person across all of London)
r <- sum(london$Flu_OBS) / sum(london$TOTAL_POP)
r
## [1] 0.0001424128
# Calculate the expected number for each borough
london$Flu_EXP <- london$TOTAL_POP * r
# Calculate the ratio of OBServed to EXPected
# (the Standardized Morbidity Ratio; 1 = borough matches the London-wide rate)
london$Flu_SMR <- london$Flu_OBS / london$Flu_EXP
# Map the SMR
sp::spplot(london, "Flu_SMR")
# SMRs above 1 represent high rates of disease - but how high does an SMR need to be before it can be considered statistically significant?
# Given a number of cases and a population, it's possible to work out confidence intervals at some level of the estimate of the ratio of cases per population using the properties of the binomial distribution
# The epitools package has a function binom.exact() which you can use to compute confidence intervals for the flu data
# These can be scaled to be confidence intervals on the SMR by dividing by the overall rate
# The london data set and the sp package are loaded
# For the binomial statistics function (called at top of routine)
# library(epitools)
# Get CI from binomial distribution
# (exact binomial confidence intervals on each borough's case proportion)
flu_ci <- binom.exact(london$Flu_OBS, london$TOTAL_POP)
# Add borough names
flu_ci$NAME <- london$NAME
# Calculate London rate, then compute SMR
r <- sum(london$Flu_OBS) / sum(london$TOTAL_POP)
flu_ci$SMR <- flu_ci$proportion / r
# Subset the high SMR data
flu_high <- flu_ci[flu_ci$SMR > 1, ]
# Plot estimates with CIs
# (dividing the proportion and its CI bounds by r rescales them to SMR units)
ggplot(flu_high, aes(x = NAME, y = proportion / r, ymin = lower / r, ymax = upper / r)) +
geom_pointrange() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Distributions and confidence intervals can be difficult things to present to non-statisticians
# An alternative is to present a probability that a value is over a threshold
# For example, public health teams might be interested in when an SMR has more than doubled, and as a statistician you can give a probability that this has happened
# Then the public health team might decide to go to some alert level when the probability of a doubling of SMR is over 0.95
# Again, the properties of the binomial distribution let you compute this for proportional data
# You can then map these exceedence probabilities for some threshold, and use a sensible color scheme to highlight probabilities close to 1
# The london data set has been loaded, and the expected flu case count, Flu_EXP has been computed
# Probability of a binomial count exceeding a multiple of its expected value:
# P(X > e * expected), where X ~ Binomial(population, observed / population).
# Vectorized over all arguments.
#
# observed:   observed case counts
# population: population sizes (the binomial size parameter)
# expected:   expected counts under the baseline rate
# e:          multiple of the expected count to test against
binom.exceed <- function(observed, population, expected, e){
  # Empirical rate used as the binomial success probability
  rate <- observed / population
  # Upper tail beyond e times the expected count; equivalent to
  # 1 - pbinom(e * expected, population, prob = rate)
  pbinom(e * expected, population, prob = rate, lower.tail = FALSE)
}
# Compute P(rate > 2)
london$Flu_gt_2 <- binom.exceed(
observed = london$Flu_OBS,
population = london$TOTAL_POP,
expected = london$Flu_EXP,
e = 2)
# Use a 50-color palette that only starts changing at around 0.9
# (40 flat greens, then short green->orange and orange->red ramps at the top)
pal <- c(
rep("#B0D0B0", 40),
colorRampPalette(c("#B0D0B0", "orange"))(5),
colorRampPalette(c("orange", "red"))(5)
)
# Plot the P(rate > 2) map
# NOTE(review): `at` gives 50 break points (49 intervals) for 50 colors;
# spplot tolerates the extra color, but len = 51 would match exactly -- confirm
sp::spplot(london, "Flu_gt_2", col.regions = pal, at = seq(0, 1, len = 50))
# A Poisson generalized linear model is a way of fitting count data to explanatory variables
# You get out parameter estimates and standard errors for your explanatory variables, and can get fitted values and residuals
# The glm() function fits Poisson GLMs. It works just like the lm() function, but you also specify a family argument
# The formula has the usual meaning - response on the left of the ~, and explanatory variables on the right
# To cope with count data coming from populations of different sizes, you specify an offset argument
# This adds a constant term for each row of the data in the model. The log of the population is used in the offset term
# The london health data set has been loaded with the sp package also ready.
# Run a Poisson generalized linear model of how the count of flu cases, Flu_OBS, depends on the Health Deprivation index value, HealthDeprivation
# The first argument is the formula (response variable on the left)
# The family argument is a function, poisson
# Fit a poisson GLM.
# Response: flu counts; explanatory: health deprivation index. The log
# population offset makes the model one for rates (cases per person).
model_flu <- glm(
Flu_OBS ~ HealthDeprivation,
offset = log(TOTAL_POP),
data = london,
family = "poisson")
# Is HealthDeprivation significant?
summary(model_flu)
##
## Call:
## glm(formula = Flu_OBS ~ HealthDeprivation, family = "poisson",
## data = london, offset = log(TOTAL_POP))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -9.5361 -4.5285 -0.0499 2.9043 8.2194
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.78190 0.02869 -306.043 <2e-16 ***
## HealthDeprivation 0.65689 0.06797 9.665 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 703.75 on 31 degrees of freedom
## Residual deviance: 605.03 on 30 degrees of freedom
## AIC: 762.37
##
## Number of Fisher Scoring iterations: 5
# Put residuals into the spatial data.
# Storing the residuals as a column of the spatial data frame lets
# spplot map them borough by borough.
london$Flu_Resid <- residuals(model_flu)
# Map the residuals using spplot
# Visible clustering of like-signed residuals would suggest the
# non-spatial model is missing spatially correlated structure.
sp::spplot(london, "Flu_Resid")
# A linear model should fit the data and leave uncorrelated residuals
# This applies to non-spatial models, where, for example, fitting a straight line through points on a curve would lead to serially-correlated residuals
# A model on spatial data should aim to have residuals that show no significant spatial correlation
# You can test the model fitted to the flu data using moran.mc() from the spdep package
# Monte Carlo Moran tests were previously discussed in the Spatial autocorrelation test exercise earlier in the chapter
# Compute the neighborhood structure.
# poly2nb() (from spdep, assumed loaded earlier in the file) builds an
# adjacency list of boroughs sharing a boundary.
borough_nb <- poly2nb(london)
# Test spatial correlation of the residuals.
# Monte Carlo Moran's I with 999 simulations; the output below reports
# p = 0.075, so residual spatial correlation is not significant at the
# 5% level, though it is borderline.
moran.mc(london$Flu_Resid, listw = nb2listw(borough_nb), nsim = 999)
##
## Monte-Carlo simulation of Moran I
##
## data: london$Flu_Resid
## weights: nb2listw(borough_nb)
## number of simulations + 1: 1000
##
## statistic = 0.15059, observed rank = 925, p-value = 0.075
## alternative hypothesis: greater
# Bayesian statistical models return samples of the parameters of interest (the "posterior" distribution) based on some "prior" distribution which is then updated by the data
# The Bayesian modelling process returns a number of samples from which you can compute the mean, or an exceedence probability, or any other quantity you might compute from a distribution
# Before you fit a model with spatial correlation, you'll first fit the same model as before, but using Bayesian inference
# The london data set has been loaded
# The R2BayesX package provides an interface to the BayesX code.
# The syntax for bayesx() is similar, but the offset has to be specified explicitly from the data frame, the family name is in quotes, and the spatial data frame needs to be turned into a plain data frame
# Run the model fitting and inspect with summary()
# Plot the samples from the Bayesian model
# On the left is the "trace" of samples in sequential order, and on the right is the parameter density
# For this model there is an intercept and a slope for the Health Deprivation score
# The parameter density should correspond with the parameter summary
# Use R2BayesX (called at top of routine)
# library(R2BayesX)
# Fit a GLM
# Same Poisson model as fitted earlier, repeated here as the frequentist
# baseline to compare against the Bayesian fit that follows.
model_flu <- glm(Flu_OBS ~ HealthDeprivation, offset = log(TOTAL_POP),
data = london, family = poisson)
# Summarize it
summary(model_flu)
##
## Call:
## glm(formula = Flu_OBS ~ HealthDeprivation, family = poisson,
## data = london, offset = log(TOTAL_POP))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -9.5361 -4.5285 -0.0499 2.9043 8.2194
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.78190 0.02869 -306.043 <2e-16 ***
## HealthDeprivation 0.65689 0.06797 9.665 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 703.75 on 31 degrees of freedom
## Residual deviance: 605.03 on 30 degrees of freedom
## AIC: 762.37
##
## Number of Fisher Scoring iterations: 5
# Calculate coeff confidence intervals
# Profile-likelihood 95% intervals; compare with the Bayesian 2.5%/97.5%
# posterior quantiles reported further below — they should be very close.
confint(model_flu)
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) -8.838677 -8.7261843
## HealthDeprivation 0.524437 0.7908841
# Fit a Bayesian GLM
# bayesx() differs from glm(): the offset is taken explicitly from the
# data frame, the family is a quoted string, and the spatial data frame
# must be converted to a plain data.frame. The fixed seed makes the
# MCMC run reproducible.
bayes_flu <- bayesx(Flu_OBS ~ HealthDeprivation, offset = log(london$TOTAL_POP),
family = "poisson", data = as.data.frame(london),
control = bayesx.control(seed = 17610407))
# Summarize it
summary(bayes_flu)
## Call:
## bayesx(formula = Flu_OBS ~ HealthDeprivation, data = as.data.frame(london),
## offset = log(london$TOTAL_POP), control = bayesx.control(seed = 17610407),
## family = "poisson")
##
## Fixed effects estimation results:
##
## Parametric coefficients:
## Mean Sd 2.5% 50% 97.5%
## (Intercept) -8.7831 0.0278 -8.8371 -8.7841 -8.7263
## HealthDeprivation 0.6592 0.0659 0.5345 0.6587 0.7900
##
## N = 32 burnin = 2000 method = MCMC family = poisson
## iterations = 12000 step = 10
# Look at the samples from the Bayesian model
# Produces, per parameter, a trace of the MCMC draws (left) and the
# posterior density (right); the densities should agree with the
# parameter summary table above.
plot(samples(bayes_flu))
# You've fitted a non-spatial GLM with BayesX
# You can include a spatially correlated term based on the adjacency structure by adding a term to the formula specifying a spatially correlated model
# Use poly2nb() to compute the neighborhood structure of london to an nb object
# R2BayesX uses its own objects for the adjacency. Convert the nb object to a gra object
# The sx function specifies additional terms to bayesx. Create a term using the "spatial" basis and the gra object for the boroughs to define the map
# Print a summary of the model object. You should see a table of coefficients for the parametric part of the model as in the previous exercise, and then a table of "Smooth terms variance" with one row for the spatial term
# Note that BayesX can fit many different forms in its sx terms, most of which, like the spatial model here, cannot be simply expressed with a parameter or two
# This table shows the variance of the random effects - for further explanation consult a text on random effects modelling
# Compute adjacency objects
# nb2gra() converts the spdep neighborhood list into the gra format
# that R2BayesX requires for spatial terms.
borough_nb <- poly2nb(london)
borough_gra <- nb2gra(borough_nb)
# Fit spatial model
# sx(i, bs = "spatial", map = borough_gra) adds a spatially correlated
# random-effect term over the boroughs defined by the gra map, on top
# of the parametric HealthDeprivation effect.
flu_spatial <- bayesx(
Flu_OBS ~ HealthDeprivation + sx(i, bs = "spatial", map = borough_gra),
offset = log(london$TOTAL_POP),
family = "poisson", data = data.frame(london),
control = bayesx.control(seed = 17610407)
)
## Note: created new output directory 'C:/Users/Dave/AppData/Local/Temp/Rtmp4cxBim/bayesx1'!
# Summarize the model
summary(flu_spatial)
## Call:
## bayesx(formula = Flu_OBS ~ HealthDeprivation + sx(i, bs = "spatial",
## map = borough_gra), data = data.frame(london), offset = log(london$TOTAL_POP),
## control = bayesx.control(seed = 17610407), family = "poisson")
##
## Fixed effects estimation results:
##
## Parametric coefficients:
## Mean Sd 2.5% 50% 97.5%
## (Intercept) -9.2311 0.1246 -9.4826 -9.2298 -9.0148
## HealthDeprivation 0.7683 0.5844 -0.3749 0.7775 1.7922
##
## Smooth terms variances:
## Mean Sd 2.5% 50% 97.5% Min Max
## sx(i):mrf 4.6381 1.6822 2.2851 4.3510 8.8104 1.6557 16.266
##
## N = 32 burnin = 2000 method = MCMC family = poisson
## iterations = 12000 step = 10
# As with glm, you can get the fitted values and residuals from your model using the fitted and residuals functions. bayesx models are a bit more complex, since you have the linear predictor and terms from sx elements, such as the spatially correlated term
# The summary function will show you information for the linear model terms and the smoothing terms in two separate tables
# The spatial term is called "sx(i):mrf" - standing for "Markov Random Field"
# Bayesian analysis returns samples from a distribution for our S(x) term at each of the London boroughs
# The fitted function from bayesx models returns summary statistics for each borough
# You'll just look at the mean of that distribution for now
# The model from the BayesX output is available as flu_spatial.
# Summarise the model
summary(flu_spatial)
## Call:
## bayesx(formula = Flu_OBS ~ HealthDeprivation + sx(i, bs = "spatial",
## map = borough_gra), data = data.frame(london), offset = log(london$TOTAL_POP),
## control = bayesx.control(seed = 17610407), family = "poisson")
##
## Fixed effects estimation results:
##
## Parametric coefficients:
## Mean Sd 2.5% 50% 97.5%
## (Intercept) -9.2311 0.1246 -9.4826 -9.2298 -9.0148
## HealthDeprivation 0.7683 0.5844 -0.3749 0.7775 1.7922
##
## Smooth terms variances:
## Mean Sd 2.5% 50% 97.5% Min Max
## sx(i):mrf 4.6381 1.6822 2.2851 4.3510 8.8104 1.6557 16.266
##
## N = 32 burnin = 2000 method = MCMC family = poisson
## iterations = 12000 step = 10
# Map the fitted spatial term only
# fitted() on a bayesx model returns per-region summary statistics for
# the requested term; take the posterior mean column of the spatial
# "sx(i):mrf" (Markov random field) term.
london$spatial <- fitted(flu_spatial, term = "sx(i):mrf")[, "Mean"]
sp::spplot(london, zcol = "spatial")
# Map the residuals
# NOTE(review): "mu" selects one residual type from the bayesx residuals
# matrix — confirm its scale (response vs. linear predictor) against the
# R2BayesX documentation.
london$spatial_resid <- residuals(flu_spatial)[, "mu"]
sp::spplot(london, zcol = "spatial_resid")
# Test residuals for spatial correlation
# With the spatial term included, the Moran test (output below) gives
# p = 0.984 — no evidence of remaining positive spatial correlation.
moran.mc(london$spatial_resid, nb2listw(borough_nb), 999)
##
## Monte-Carlo simulation of Moran I
##
## data: london$spatial_resid
## weights: nb2listw(borough_nb)
## number of simulations + 1: 1000
##
## statistic = -0.26922, observed rank = 16, p-value = 0.984
## alternative hypothesis: greater
Chapter 4 - Geostatistics